root/usr/src/uts/common/inet/ip/ip_dce.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright 2017, OmniTI Computer Consulting, Inc. All rights reserved.
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/disp.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>
#include <sys/callb.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/snmpcom.h>

#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/ip_ndp.h>
#include <inet/ipclassifier.h>
#include <inet/ip_listutils.h>

#include <sys/sunddi.h>

/*
 * Routines for handling destination cache entries.
 * There is always one DCEF_DEFAULT entry for each ip_stack_t; it is created
 * at init time and holds both the IP ident value and the dce generation
 * number.
 *
 * Any time a DCE is changed significantly (different path MTU, but NOT
 * different ULP info!), the dce_generation number is increased.
 * Also, when a new DCE is created, the dce_generation number in the default
 * DCE is bumped. That allows the dce_t information to be cached efficiently
 * as long as the entity caching the dce_t also caches the dce_generation,
 * and compares the cached generation to detect any changes.
 * Furthermore, when a DCE is deleted, if there are any outstanding references
 * to the DCE it will be marked as condemned. The condemned mark is
 * a designated generation number which is never otherwise used, hence
 * the single comparison with the generation number captures that as well.
 *
 * An example of code which caches is as follows:
 *
 *      if (mystruct->my_dce_generation != mystruct->my_dce->dce_generation) {
 *              The DCE has changed
 *              mystruct->my_dce = dce_lookup_pkt(mp, ixa,
 *                  &mystruct->my_dce_generation);
 *              Not needed in practice, since we have the default DCE:
 *              if (DCE_IS_CONDEMNED(mystruct->my_dce))
 *                      return failure;
 *      }
 *
 * Note that for IPv6 link-local addresses we record the ifindex since the
 * link-locals are not globally unique.
 *
 * DCEs can remain for an arbitrarily long time, until memory pressure or
 * too-deep hash buckets (see dce_lookup_and_add*()) enable the reclaim thread
 * to actually remove DCEs from the cache.
 */

/*
 * Hash bucket structure for DCEs.
 *
 * Each ip_stack_t carries two arrays of these buckets, ips_dce_hash_v4 and
 * ips_dce_hash_v6, each ips_dce_hashsize entries long.
 */
typedef struct dcb_s {
        krwlock_t       dcb_lock;       /* Writer to link/unlink, else reader */
        uint32_t        dcb_cnt;        /* Number of DCEs linked in bucket */
        dce_t           *dcb_dce;       /* Head of the dce_next chain */
} dcb_t;

/* Forward declarations for helpers defined later in this file. */
static void     dce_delete_locked(dcb_t *, dce_t *);
static void     dce_make_condemned(dce_t *);

/* kmem cache from which every dce_t (including the default DCE) is taken. */
static kmem_cache_t *dce_cache;
/* The reclaim worker; set back to NULL by the worker itself when it exits. */
static kthread_t *dce_reclaim_thread;
/* Protects dce_reclaim_shutdown and the worker's wait on dce_reclaim_cv. */
static kmutex_t dce_reclaim_lock;
/* Signalled to wake the worker early and by the worker when it has exited. */
static kcondvar_t dce_reclaim_cv;
/* Set non-zero by dce_g_destroy() to ask the worker to exit. */
static int dce_reclaim_shutdown;

/*
 * Number of buckets in each per-stack hash table.
 * Global so it can be tuned in /etc/system. This must be a power of two.
 */
uint_t ip_dce_hash_size = 1024;

/* The time in seconds between executions of the IP DCE reclaim worker. */
uint_t ip_dce_reclaim_interval = 60;

/*
 * The factor of the DCE threshold at which to start hard reclaims: once
 * dcb_reclaim() has retained (threshold * this factor) entries in a bucket,
 * every remaining entry in that bucket is deleted.
 */
uint_t ip_dce_reclaim_threshold_hard = 2;

/*
 * XOR-folds a value with itself shifted right by 16, 32 and 48 bits.
 * Operates on a uint64_t.  Note that the argument is evaluated four times.
 */
#define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48))

/*
 * Thin out one hash bucket.
 *
 * Every DCE in the bucket is first aged: if it carries DCEF_PMTU and its
 * dce_last_change_time is more than ips_ip_pathmtu_interval seconds old,
 * the flag is cleared and the generation of that DCE is bumped.
 *
 * Each DCE is then either retained or deleted.  A hash of the DCE's address
 * mixed with a gethrtime() seed selects roughly one in "fraction" entries
 * for deletion, or one in "fraction * 4" for entries that still have
 * DCEF_PMTU set, so entries with path MTU state are more likely to survive.
 * When the hard limit (ips_ip_dce_reclaim_threshold multiplied by
 * ip_dce_reclaim_threshold_hard) is non-zero, at most that many entries are
 * retained; everything encountered after that is deleted unconditionally.
 */
static void
dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction)
{
        dce_t           *cur, *following;
        uint_t          hash;
        uint_t          divisor;
        uint_t          kept = 0;
        uint_t          keep_limit;
        uint_t          pmtu_fraction = fraction * 4;
        hrtime_t        seed = gethrtime();
        int             expired;

        keep_limit = ipst->ips_ip_dce_reclaim_threshold;
        keep_limit *= ip_dce_reclaim_threshold_hard;

        rw_enter(&dcb->dcb_lock, RW_WRITER);
        for (cur = dcb->dcb_dce; cur != NULL; cur = following) {
                /* Remember the successor now; cur may be unlinked below. */
                following = cur->dce_next;

                /* Drop DCEF_PMTU once the path MTU information is too old. */
                mutex_enter(&cur->dce_lock);
                expired = (cur->dce_flags & DCEF_PMTU) &&
                    TICK_TO_SEC(ddi_get_lbolt64()) -
                    cur->dce_last_change_time >
                    ipst->ips_ip_pathmtu_interval;
                if (expired)
                        cur->dce_flags &= ~DCEF_PMTU;
                mutex_exit(&cur->dce_lock);

                /* dce_increment_generation() acquires dce_lock itself. */
                if (expired)
                        dce_increment_generation(cur);

                if (keep_limit == 0 || kept < keep_limit) {
                        hash = RANDOM_HASH((uint64_t)((uintptr_t)cur | seed));
                        divisor = (cur->dce_flags & DCEF_PMTU) ?
                            pmtu_fraction : fraction;

                        if (hash % divisor != 0) {
                                kept++;
                                continue;
                        }
                }

                IP_STAT(ipst, ip_dce_reclaim_deleted);
                dce_delete_locked(dcb, cur);
                dce_refrele(cur);
        }
        rw_exit(&dcb->dcb_lock);
}

/*
 * Reclaim DCEs from every IPv4 and IPv6 hash bucket of one IP stack using
 * the stack's ips_ip_dce_reclaim_fraction, then walk the conns so that
 * stale cached references get dropped.  Called from ip_dce_reclaim().
 */
static void
ip_dce_reclaim_stack(ip_stack_t *ipst)
{
        int     bucket = 0;

        IP_STAT(ipst, ip_dce_reclaim_calls);
        while (bucket < ipst->ips_dce_hashsize) {
                dcb_t   *v4 = &ipst->ips_dce_hash_v4[bucket];
                dcb_t   *v6 = &ipst->ips_dce_hash_v6[bucket];

                dcb_reclaim(v4, ipst, ipst->ips_ip_dce_reclaim_fraction);
                dcb_reclaim(v6, ipst, ipst->ips_ip_dce_reclaim_fraction);
                bucket++;
        }

        /*
         * Any CONN may be holding a reference on an ire, nce or dce.
         * Walk them all so that stale references are refreshed and the
         * corresponding refholds released.
         */
        ipcl_walk(conn_ixa_cleanup, (void *)B_FALSE, ipst);
}

/*
 * Visit every netstack and reclaim DCEs in those whose
 * ips_dce_reclaim_needed flag is set; the flag is cleared atomically as it
 * is tested.  Only dce_reclaim_worker() calls this, which the ASSERT below
 * enforces.  The flag is typically set because a hash bucket grew past the
 * tunable threshold.
 */
static void
ip_dce_reclaim(void)
{
        netstack_handle_t       handle;
        netstack_t              *ns;
        ip_stack_t              *ipst;

        ASSERT(curthread == dce_reclaim_thread);

        netstack_next_init(&handle);
        for (ns = netstack_next(&handle); ns != NULL;
            ns = netstack_next(&handle)) {
                /*
                 * At boot time netstack_next() can hand back a netstack_t
                 * whose netstack_ip is still NULL; skip those.
                 */
                ipst = ns->netstack_ip;
                if (ipst != NULL &&
                    atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0)
                        ip_dce_reclaim_stack(ipst);
                netstack_rele(ns);
        }
        netstack_next_fini(&handle);
}

/*
 * Body of the reclaim thread started by dce_g_init().
 *
 * The thread sleeps on dce_reclaim_cv for up to ip_dce_reclaim_interval
 * seconds at a time and calls ip_dce_reclaim() after each wakeup, until
 * dce_reclaim_shutdown is set.  On the way out it clears dce_reclaim_thread
 * and dce_reclaim_shutdown and broadcasts on the cv so that a waiter in
 * dce_g_destroy() can proceed.
 */
/* ARGSUSED */
static void
dce_reclaim_worker(void *arg)
{
        callb_cpr_t     cprinfo;

        CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr,
            "dce_reclaim_worker");

        mutex_enter(&dce_reclaim_lock);
        while (!dce_reclaim_shutdown) {
                /* The wait is bracketed as CPR-safe. */
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                (void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock,
                    ddi_get_lbolt() + ip_dce_reclaim_interval * hz);
                CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock);

                /* Shutdown may have been requested while we slept. */
                if (dce_reclaim_shutdown)
                        break;

                /* Do the actual reclaim without holding dce_reclaim_lock. */
                mutex_exit(&dce_reclaim_lock);
                ip_dce_reclaim();
                mutex_enter(&dce_reclaim_lock);
        }

        ASSERT(MUTEX_HELD(&dce_reclaim_lock));
        /* Tell dce_g_destroy() that the thread is gone. */
        dce_reclaim_thread = NULL;
        dce_reclaim_shutdown = 0;
        cv_broadcast(&dce_reclaim_cv);
        CALLB_CPR_EXIT(&cprinfo);       /* drops the lock */

        thread_exit();
}

/*
 * One-time global setup: create the kmem cache that dce_t allocations come
 * from, initialize the lock and condition variable used to coordinate with
 * the reclaim worker, and start the worker thread (dce_reclaim_worker).
 */
void
dce_g_init(void)
{
        /* No constructor, destructor or reclaim callback is registered. */
        dce_cache = kmem_cache_create("dce_cache",
            sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

        mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL);

        dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker,
            NULL, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * Global teardown: ask the reclaim worker to exit, wait until it has done
 * so, then destroy the cv, the lock and the dce kmem cache.
 */
void
dce_g_destroy(void)
{
        mutex_enter(&dce_reclaim_lock);
        dce_reclaim_shutdown = 1;
        /* Wake the worker out of its timed wait. */
        cv_signal(&dce_reclaim_cv);
        /* The worker sets dce_reclaim_thread to NULL and broadcasts. */
        while (dce_reclaim_thread != NULL)
                cv_wait(&dce_reclaim_cv, &dce_reclaim_lock);
        mutex_exit(&dce_reclaim_lock);

        cv_destroy(&dce_reclaim_cv);
        mutex_destroy(&dce_reclaim_lock);

        kmem_cache_destroy(dce_cache);
}

/*
 * Per-stack setup: allocate the default DCE (one permanent reference,
 * DCEF_DEFAULT, initial generation) and the IPv4 and IPv6 hash tables,
 * each ip_dce_hash_size buckets long.
 */
void
dce_stack_init(ip_stack_t *ipst)
{
        dce_t   *def;
        size_t  tablesz;
        int     b;

        def = kmem_cache_alloc(dce_cache, KM_SLEEP);
        bzero(def, sizeof (*def));
        def->dce_ipst = ipst;
        def->dce_flags = DCEF_DEFAULT;
        def->dce_generation = DCE_GENERATION_INITIAL;
        def->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
        def->dce_refcnt = 1;    /* Should never go away */
        ipst->ips_dce_default = def;

        /* This must be a power of two since we are using IRE_ADDR_HASH macro */
        ipst->ips_dce_hashsize = ip_dce_hash_size;
        tablesz = ipst->ips_dce_hashsize * sizeof (dcb_t);
        ipst->ips_dce_hash_v4 = kmem_zalloc(tablesz, KM_SLEEP);
        ipst->ips_dce_hash_v6 = kmem_zalloc(tablesz, KM_SLEEP);

        for (b = 0; b < ipst->ips_dce_hashsize; b++) {
                rw_init(&ipst->ips_dce_hash_v4[b].dcb_lock, NULL, RW_DEFAULT,
                    NULL);
                rw_init(&ipst->ips_dce_hash_v6[b].dcb_lock, NULL, RW_DEFAULT,
                    NULL);
        }
}

/*
 * Unlink DCEs from one hash bucket and drop the hash list's reference on
 * each.  With specific_ifindex set only entries whose dce_ifindex equals
 * ifindex are removed; otherwise the whole bucket is emptied and ifindex is
 * ignored.  The filter is an explicit flag rather than a magic ifindex
 * value.
 */
static void
dce_bucket_clean(dcb_t *dcb, boolean_t specific_ifindex, uint_t ifindex)
{
        dce_t   *cur, *following;

        rw_enter(&dcb->dcb_lock, RW_WRITER);

        cur = dcb->dcb_dce;
        while (cur != NULL) {
                /* Save the successor; cur may be unlinked below. */
                following = cur->dce_next;
                if (specific_ifindex && cur->dce_ifindex != ifindex) {
                        cur = following;
                        continue;
                }
                dce_delete_locked(dcb, cur);
                dce_refrele(cur);
                cur = following;
        }

        rw_exit(&dcb->dcb_lock);
}

/*
 * Per-stack teardown: empty and destroy every bucket, free both hash
 * tables, and free the default DCE, which by now must be down to its single
 * permanent reference.
 */
void
dce_stack_destroy(ip_stack_t *ipst)
{
        size_t  tablesz = ipst->ips_dce_hashsize * sizeof (dcb_t);
        int     b;

        for (b = 0; b < ipst->ips_dce_hashsize; b++) {
                dcb_t   *v4 = &ipst->ips_dce_hash_v4[b];
                dcb_t   *v6 = &ipst->ips_dce_hash_v6[b];

                dce_bucket_clean(v4, B_FALSE, 0);
                rw_destroy(&v4->dcb_lock);
                dce_bucket_clean(v6, B_FALSE, 0);
                rw_destroy(&v6->dcb_lock);
        }

        kmem_free(ipst->ips_dce_hash_v4, tablesz);
        ipst->ips_dce_hash_v4 = NULL;
        kmem_free(ipst->ips_dce_hash_v6, tablesz);
        ipst->ips_dce_hash_v6 = NULL;
        ipst->ips_dce_hashsize = 0;

        ASSERT(ipst->ips_dce_default->dce_refcnt == 1);
        kmem_cache_free(dce_cache, ipst->ips_dce_default);
        ipst->ips_dce_default = NULL;
}

/*
 * For callers that are happy with any DCE: return the stack's default DCE
 * with a reference held for the caller.
 */
dce_t *
dce_get_default(ip_stack_t *ipst)
{
        dce_t   *def = ipst->ips_dce_default;

        dce_refhold(def);
        return (def);
}

/*
 * Look up the DCE for the packet in mp; generic for IPv4 and IPv6.
 *
 * The address used is the final destination, i.e. it is taken from the
 * source route option (IPv4) or routing extension header (IPv6) if the
 * packet has one.  For IPv6 link-scope destinations the ifindex is taken
 * from the ill behind ixa_nce, when ixa_nce is set.
 *
 * Intended for callers that cache the result (e.g. the datapath); the
 * generation number to cache alongside is returned through generationp.
 */
dce_t *
dce_lookup_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp)
{
        in6_addr_t      dst6;
        uint_t          ifindex = 0;

        if (ixa->ixa_flags & IXAF_IS_IPV4) {
                ipaddr_t        dst4 = ip_get_dst((ipha_t *)mp->b_rptr);

                return (dce_lookup_v4(dst4, ixa->ixa_ipst, generationp));
        }

        dst6 = ip_get_dst_v6((ip6_t *)mp->b_rptr, mp, NULL);
        if (IN6_IS_ADDR_LINKSCOPE(&dst6) && ixa->ixa_nce != NULL) {
                ifindex = ixa->ixa_nce->nce_common->ncec_ill->
                    ill_phyint->phyint_ifindex;
        }
        return (dce_lookup_v6(&dst6, ifindex, ixa->ixa_ipst, generationp));
}

/*
 * Find the DCE for an IPv4 destination.
 *
 * Returns the matching non-condemned DCE, or the stack's default DCE when
 * there is none; either way a reference is held for the caller.  When
 * generationp is not NULL it receives the generation of the returned DCE,
 * for callers (e.g. the datapath) that cache the result.
 */
dce_t *
dce_lookup_v4(ipaddr_t dst, ip_stack_t *ipst, uint_t *generationp)
{
        uint_t  idx;
        dcb_t   *bucket;
        dce_t   *dce;

        /* Set *generationp before dropping the lock(s) that allow additions */
        if (generationp != NULL)
                *generationp = ipst->ips_dce_default->dce_generation;

        idx = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
        bucket = &ipst->ips_dce_hash_v4[idx];

        rw_enter(&bucket->dcb_lock, RW_READER);
        for (dce = bucket->dcb_dce; dce != NULL; dce = dce->dce_next) {
                if (dce->dce_v4addr != dst)
                        continue;

                mutex_enter(&dce->dce_lock);
                if (DCE_IS_CONDEMNED(dce)) {
                        mutex_exit(&dce->dce_lock);
                        continue;
                }
                /* Live match: hold it and report its own generation. */
                dce_refhold(dce);
                if (generationp != NULL)
                        *generationp = dce->dce_generation;
                mutex_exit(&dce->dce_lock);
                break;
        }
        rw_exit(&bucket->dcb_lock);

        if (dce == NULL) {
                /* Not found; fall back to the default DCE. */
                dce = ipst->ips_dce_default;
                dce_refhold(dce);
        }
        return (dce);
}

/*
 * Find the DCE for an IPv6 destination; both the address and the ifindex
 * must match.  ifindex should only be set for link-locals.
 *
 * Returns the matching non-condemned DCE, or the stack's default DCE when
 * there is none; either way a reference is held for the caller.  When
 * generationp is not NULL it receives the generation of the returned DCE,
 * for callers (e.g. the datapath) that cache the result.
 */
dce_t *
dce_lookup_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst,
    uint_t *generationp)
{
        uint_t  idx;
        dcb_t   *bucket;
        dce_t   *dce;

        /* Set *generationp before dropping the lock(s) that allow additions */
        if (generationp != NULL)
                *generationp = ipst->ips_dce_default->dce_generation;

        idx = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
        bucket = &ipst->ips_dce_hash_v6[idx];

        rw_enter(&bucket->dcb_lock, RW_READER);
        for (dce = bucket->dcb_dce; dce != NULL; dce = dce->dce_next) {
                if (!IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) ||
                    dce->dce_ifindex != ifindex)
                        continue;

                mutex_enter(&dce->dce_lock);
                if (DCE_IS_CONDEMNED(dce)) {
                        mutex_exit(&dce->dce_lock);
                        continue;
                }
                /* Live match: hold it and report its own generation. */
                dce_refhold(dce);
                if (generationp != NULL)
                        *generationp = dce->dce_generation;
                mutex_exit(&dce->dce_lock);
                break;
        }
        rw_exit(&bucket->dcb_lock);

        if (dce == NULL) {
                /* Not found; fall back to the default DCE. */
                dce = ipst->ips_dce_default;
                dce_refhold(dce);
        }
        return (dce);
}

/*
 * Find the live (non-condemned) IPv4 DCE for dst, creating and linking a
 * new one if the bucket has none.  Search and insertion happen under one
 * writer hold of the bucket lock, which makes the operation atomic.
 *
 * Returns the DCE with a reference held for the caller, or NULL when the
 * KM_NOSLEEP allocation of a new entry fails.  Creating an entry bumps the
 * generation number of the default DCE so that conn_ip_output will detect
 * there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst)
{
        uint_t  idx;
        dcb_t   *bucket;
        dce_t   *dce;

        idx = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize);
        bucket = &ipst->ips_dce_hash_v4[idx];

        /*
         * Buckets are assumed to fill fairly evenly, so a single overly
         * full bucket is taken as the cue to prune the whole cache.
         */
        if (bucket->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
                atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);

        rw_enter(&bucket->dcb_lock, RW_WRITER);
        for (dce = bucket->dcb_dce; dce != NULL; dce = dce->dce_next) {
                if (dce->dce_v4addr != dst)
                        continue;

                mutex_enter(&dce->dce_lock);
                if (DCE_IS_CONDEMNED(dce)) {
                        mutex_exit(&dce->dce_lock);
                        continue;
                }
                dce_refhold(dce);
                mutex_exit(&dce->dce_lock);
                rw_exit(&bucket->dcb_lock);
                return (dce);
        }

        /* No live entry; build one. */
        dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
        if (dce == NULL) {
                rw_exit(&bucket->dcb_lock);
                return (NULL);
        }
        bzero(dce, sizeof (*dce));
        dce->dce_ipst = ipst;   /* No netstack_hold */
        dce->dce_ipversion = IPV4_VERSION;
        dce->dce_v4addr = dst;
        dce->dce_generation = DCE_GENERATION_INITIAL;
        dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
        dce_refhold(dce);       /* For the hash list */

        /* Insert at the head of the bucket's list */
        dce->dce_bucket = bucket;
        dce->dce_ptpn = &bucket->dcb_dce;
        dce->dce_next = bucket->dcb_dce;
        if (dce->dce_next != NULL)
                dce->dce_next->dce_ptpn = &dce->dce_next;
        bucket->dcb_dce = dce;
        atomic_inc_32(&bucket->dcb_cnt);
        dce_refhold(dce);       /* For the caller */
        rw_exit(&bucket->dcb_lock);

        /* Initialize dce_ident to be different than for the last packet */
        dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;

        dce_increment_generation(ipst->ips_dce_default);
        return (dce);
}

/*
 * Find the live (non-condemned) IPv6 DCE for dst/ifindex, creating and
 * linking a new one if the bucket has none.  Search and insertion happen
 * under one writer hold of the bucket lock, which makes the operation
 * atomic.  ifindex should only be used with link-local addresses.
 *
 * Returns the DCE with a reference held for the caller, or NULL when the
 * KM_NOSLEEP allocation of a new entry fails.  Creating an entry bumps the
 * generation number of the default DCE so that conn_ip_output will detect
 * there is a new DCE.
 */
dce_t *
dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst)
{
        uint_t  idx;
        dcb_t   *bucket;
        dce_t   *dce;

        /* We should not create entries for link-locals w/o an ifindex */
        ASSERT(!(IN6_IS_ADDR_LINKSCOPE(dst)) || ifindex != 0);

        idx = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize);
        bucket = &ipst->ips_dce_hash_v6[idx];

        /*
         * Buckets are assumed to fill fairly evenly, so a single overly
         * full bucket is taken as the cue to prune the whole cache.
         */
        if (bucket->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold)
                atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1);

        rw_enter(&bucket->dcb_lock, RW_WRITER);
        for (dce = bucket->dcb_dce; dce != NULL; dce = dce->dce_next) {
                if (!IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) ||
                    dce->dce_ifindex != ifindex)
                        continue;

                mutex_enter(&dce->dce_lock);
                if (DCE_IS_CONDEMNED(dce)) {
                        mutex_exit(&dce->dce_lock);
                        continue;
                }
                dce_refhold(dce);
                mutex_exit(&dce->dce_lock);
                rw_exit(&bucket->dcb_lock);
                return (dce);
        }

        /* No live entry; build one. */
        dce = kmem_cache_alloc(dce_cache, KM_NOSLEEP);
        if (dce == NULL) {
                rw_exit(&bucket->dcb_lock);
                return (NULL);
        }
        bzero(dce, sizeof (*dce));
        dce->dce_ipst = ipst;   /* No netstack_hold */
        dce->dce_ipversion = IPV6_VERSION;
        dce->dce_v6addr = *dst;
        dce->dce_ifindex = ifindex;
        dce->dce_generation = DCE_GENERATION_INITIAL;
        dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
        dce_refhold(dce);       /* For the hash list */

        /* Insert at the head of the bucket's list */
        dce->dce_bucket = bucket;
        dce->dce_ptpn = &bucket->dcb_dce;
        dce->dce_next = bucket->dcb_dce;
        if (dce->dce_next != NULL)
                dce->dce_next->dce_ptpn = &dce->dce_next;
        bucket->dcb_dce = dce;
        atomic_inc_32(&bucket->dcb_cnt);
        dce_refhold(dce);       /* For the caller */
        rw_exit(&bucket->dcb_lock);

        /* Initialize dce_ident to be different than for the last packet */
        dce->dce_ident = ipst->ips_dce_default->dce_ident + 1;
        dce_increment_generation(ipst->ips_dce_default);
        return (dce);
}

/*
 * Fold the ULP-supplied metrics in uinfo into an existing DCE.  Finding or
 * creating the per-destination DCE is the job of the callers,
 * dce_update_uinfo_v4() and dce_update_uinfo_v6().
 *
 * Note that we do not bump the generation number here.
 * New connections will find the new uinfo.
 *
 * Zero-valued fields in uinfo are ignored.  Concurrent updates are
 * serialized by dce_lock.
 */
static void
dce_setuinfo(dce_t *dce, iulp_t *uinfo)
{
        mutex_enter(&dce->dce_lock);

        /* Guard against setting to zero */
        if (uinfo->iulp_rtt != 0) {
                /*
                 * With a cached value present, average old and new.
                 * Without one, start conservatively at 1.5 * new value.
                 */
                dce->dce_uinfo.iulp_rtt = (dce->dce_uinfo.iulp_rtt != 0) ?
                    (dce->dce_uinfo.iulp_rtt + uinfo->iulp_rtt) >> 1 :
                    uinfo->iulp_rtt + (uinfo->iulp_rtt >> 1);

                dce->dce_uinfo.iulp_rtt_sd =
                    (dce->dce_uinfo.iulp_rtt_sd != 0) ?
                    (dce->dce_uinfo.iulp_rtt_sd + uinfo->iulp_rtt_sd) >> 1 :
                    uinfo->iulp_rtt_sd + (uinfo->iulp_rtt_sd >> 1);
        }

        if (uinfo->iulp_mtu != 0) {
                if (!(dce->dce_flags & DCEF_PMTU)) {
                        /* First path MTU for this DCE; cap at IP_MAXPACKET. */
                        dce->dce_pmtu = MIN(uinfo->iulp_mtu, IP_MAXPACKET);
                        dce->dce_flags |= DCEF_PMTU;
                } else {
                        /* Never raise an already recorded path MTU here. */
                        dce->dce_pmtu = MIN(uinfo->iulp_mtu, dce->dce_pmtu);
                }
                dce->dce_last_change_time = TICK_TO_SEC(ddi_get_lbolt64());
        }

        if (uinfo->iulp_ssthresh != 0) {
                if (dce->dce_uinfo.iulp_ssthresh == 0) {
                        dce->dce_uinfo.iulp_ssthresh = uinfo->iulp_ssthresh;
                } else {
                        dce->dce_uinfo.iulp_ssthresh =
                            (uinfo->iulp_ssthresh +
                            dce->dce_uinfo.iulp_ssthresh) >> 1;
                }
        }

        /* We have uinfo for sure */
        dce->dce_flags |= DCEF_UINFO;
        mutex_exit(&dce->dce_lock);
}


/*
 * Record uinfo against the IPv4 destination dst, creating the DCE if
 * needed.  Returns 0 on success or ENOMEM if no DCE could be allocated.
 */
int
dce_update_uinfo_v4(ipaddr_t dst, iulp_t *uinfo, ip_stack_t *ipst)
{
        dce_t   *dce = dce_lookup_and_add_v4(dst, ipst);

        if (dce != NULL) {
                dce_setuinfo(dce, uinfo);
                dce_refrele(dce);
                return (0);
        }
        return (ENOMEM);
}

/*
 * Record uinfo against the IPv6 destination dst/ifindex, creating the DCE
 * if needed.  Returns 0 on success or ENOMEM if no DCE could be allocated.
 */
int
dce_update_uinfo_v6(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
        dce_t   *dce = dce_lookup_and_add_v6(dst, ifindex, ipst);

        if (dce != NULL) {
                dce_setuinfo(dce, uinfo);
                dce_refrele(dce);
                return (0);
        }
        return (ENOMEM);
}

/*
 * Common entry point for IPv4 and IPv6: destinations for which
 * IN6_IS_ADDR_V4MAPPED_ANY() holds are converted to an ipaddr_t and handled
 * by the IPv4 routine, everything else by the IPv6 routine.  Returns
 * whatever the chosen routine returns.
 */
int
dce_update_uinfo(const in6_addr_t *dst, uint_t ifindex, iulp_t *uinfo,
    ip_stack_t *ipst)
{
        ipaddr_t        v4dst;

        if (!IN6_IS_ADDR_V4MAPPED_ANY(dst))
                return (dce_update_uinfo_v6(dst, ifindex, uinfo, ipst));

        IN6_V4MAPPED_TO_IPADDR(dst, v4dst);
        return (dce_update_uinfo_v4(v4dst, uinfo, ipst));
}

/*
 * Mark a DCE as condemned by storing the reserved
 * DCE_GENERATION_CONDEMNED value in its generation, and account for it in
 * the stack's ips_num_dce_condemned counter.  The DCE must not already be
 * condemned.
 */
static void
dce_make_condemned(dce_t *dce)
{
        mutex_enter(&dce->dce_lock);
        ASSERT(!DCE_IS_CONDEMNED(dce));
        dce->dce_generation = DCE_GENERATION_CONDEMNED;
        mutex_exit(&dce->dce_lock);

        /* Keep the per-stack count of condemned dces up to date */
        atomic_inc_32(&dce->dce_ipst->ips_num_dce_condemned);
}

/*
 * Advance the generation number of a DCE under dce_lock.  A condemned DCE
 * is left untouched, and the counter wraps to DCE_GENERATION_INITIAL
 * instead of ever landing on the reserved condemned value.
 */
void
dce_increment_generation(dce_t *dce)
{
        uint_t  next;

        mutex_enter(&dce->dce_lock);
        if (DCE_IS_CONDEMNED(dce)) {
                mutex_exit(&dce->dce_lock);
                return;
        }

        next = dce->dce_generation + 1;
        if (next == DCE_GENERATION_CONDEMNED)
                next = DCE_GENERATION_INITIAL;
        ASSERT(next != DCE_GENERATION_VERIFY);
        dce->dce_generation = next;
        mutex_exit(&dce->dce_lock);
}

/*
 * Bump the generation number of every non-condemned DCE in the IPv6 (isv6)
 * or IPv4 hash table, and of the default DCE.
 * Used when ill_mtu or ill_mc_mtu changes.
 */
void
dce_increment_all_generations(boolean_t isv6, ip_stack_t *ipst)
{
        dcb_t   *table = isv6 ? ipst->ips_dce_hash_v6 : ipst->ips_dce_hash_v4;
        dcb_t   *bucket;
        dce_t   *dce;
        int     b;

        for (b = 0; b < ipst->ips_dce_hashsize; b++) {
                bucket = &table[b];

                rw_enter(&bucket->dcb_lock, RW_WRITER);
                for (dce = bucket->dcb_dce; dce != NULL; dce = dce->dce_next) {
                        if (!DCE_IS_CONDEMNED(dce))
                                dce_increment_generation(dce);
                }
                rw_exit(&bucket->dcb_lock);
        }
        dce_increment_generation(ipst->ips_dce_default);
}

/*
 * Unlink a DCE from its hash bucket and mark it condemned.  The caller is
 * expected to hold dcb_lock as writer (both callers in this file do).
 *
 * The hash list's reference is NOT dropped here; the caller needs to do a
 * dce_refrele for it.  The original note on this function says that is
 * because the dce_refrele can't be done under dcb_lock.
 * NOTE(review): dcb_reclaim() and dce_bucket_clean() nevertheless call
 * dce_refrele() while still holding dcb_lock - confirm which is intended.
 */
static void
dce_delete_locked(dcb_t *dcb, dce_t *dce)
{
        dce->dce_bucket = NULL;
        /* Point whatever pointed at us (head or predecessor) past us. */
        *dce->dce_ptpn = dce->dce_next;
        if (dce->dce_next != NULL)
                dce->dce_next->dce_ptpn = dce->dce_ptpn;
        dce->dce_ptpn = NULL;
        dce->dce_next = NULL;
        atomic_dec_32(&dcb->dcb_cnt);
        dce_make_condemned(dce);
}

/*
 * Free a DCE whose last reference has been dropped.  It must already be
 * unlinked from its bucket and must not be the default DCE.
 */
static void
dce_inactive(dce_t *dce)
{
        ASSERT(!(dce->dce_flags & DCEF_DEFAULT));
        ASSERT(dce->dce_ptpn == NULL);
        ASSERT(dce->dce_bucket == NULL);

        /* Undo the accounting done when the dce was condemned */
        if (DCE_IS_CONDEMNED(dce))
                atomic_dec_32(&dce->dce_ipst->ips_num_dce_condemned);

        kmem_cache_free(dce_cache, dce);
}

/*
 * Drop one reference on a DCE; the DCE is freed via dce_inactive() when
 * the count reaches zero.
 */
void
dce_refrele(dce_t *dce)
{
        uint32_t        remaining;

        ASSERT(dce->dce_refcnt != 0);
        remaining = atomic_dec_32_nv(&dce->dce_refcnt);
        if (remaining == 0)
                dce_inactive(dce);
}

/*
 * Take one additional reference on a DCE.  The ASSERT checks that the
 * count is non-zero after the increment.
 */
void
dce_refhold(dce_t *dce)
{
        atomic_inc_32(&dce->dce_refcnt);
        ASSERT(dce->dce_refcnt != 0);
}

/*
 * "No trace" variant of dce_refrele().  There is no tracing support yet,
 * so it does exactly the same thing: drop one reference and free the DCE
 * via dce_inactive() when the count reaches zero.
 */
void
dce_refrele_notr(dce_t *dce)
{
        uint32_t        remaining;

        ASSERT(dce->dce_refcnt != 0);
        remaining = atomic_dec_32_nv(&dce->dce_refcnt);
        if (remaining == 0)
                dce_inactive(dce);
}

/*
 * "No trace" variant of dce_refhold().  Its body is currently identical:
 * take one additional reference on the DCE.
 */
void
dce_refhold_notr(dce_t *dce)
{
        atomic_inc_32(&dce->dce_refcnt);
        ASSERT(dce->dce_refcnt != 0);
}

/*
 * Report both the IPv4 and IPv6 DCEs.
 *
 * Two replies are sent up q with qreply(): one at level MIB2_IP holding a
 * dest_cache_entry_t per IPv4 DCE, then one at level MIB2_IP6 holding one
 * per IPv6 DCE, both named EXPER_IP_DCE.  mpctl is consumed by the first
 * reply and a copy of it by the second.
 *
 * Returns a further copy of the control message, or NULL if a copymsg()
 * failed; if the first copy fails only the IPv4 reply is sent.
 *
 * dest_cache lives on the stack and is copied into the reply as a whole.
 * It is zeroed before each pass so that fields a pass does not fill in
 * (the IPv4 pass never writes DestIpv6Address, the IPv6 pass never writes
 * DestIpv4Address) and any padding do not carry stale stack contents or
 * leftovers from the other pass out to the consumer.
 */
mblk_t *
ip_snmp_get_mib2_ip_dce(queue_t *q, mblk_t *mpctl, ip_stack_t *ipst)
{
        struct opthdr           *optp;
        mblk_t                  *mp2ctl;
        dest_cache_entry_t      dest_cache;
        mblk_t                  *mp_tail = NULL;
        dce_t                   *dce;
        dcb_t                   *dcb;
        int                     i;
        uint64_t                current_time;

        current_time = TICK_TO_SEC(ddi_get_lbolt64());

        /*
         * make a copy of the original message
         */
        mp2ctl = copymsg(mpctl);

        /* First we do IPv4 entries */
        optp = (struct opthdr *)&mpctl->b_rptr[
            sizeof (struct T_optmgmt_ack)];
        optp->level = MIB2_IP;
        optp->name = EXPER_IP_DCE;

        /* Start from a clean record; see the block comment above. */
        bzero(&dest_cache, sizeof (dest_cache));

        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                dcb = &ipst->ips_dce_hash_v4[i];
                rw_enter(&dcb->dcb_lock, RW_READER);
                for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                        dest_cache.DestIpv4Address = dce->dce_v4addr;
                        dest_cache.DestFlags = dce->dce_flags;
                        /* dce_pmtu is only meaningful with DCEF_PMTU set */
                        if (dce->dce_flags & DCEF_PMTU)
                                dest_cache.DestPmtu = dce->dce_pmtu;
                        else
                                dest_cache.DestPmtu = 0;
                        dest_cache.DestIdent = dce->dce_ident;
                        dest_cache.DestIfindex = 0;
                        dest_cache.DestAge = current_time -
                            dce->dce_last_change_time;
                        /* Best effort: a failed append only loses the entry */
                        if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
                            (char *)&dest_cache, (int)sizeof (dest_cache))) {
                                ip1dbg(("ip_snmp_get_mib2_ip_dce: "
                                    "failed to allocate %u bytes\n",
                                    (uint_t)sizeof (dest_cache)));
                        }
                }
                rw_exit(&dcb->dcb_lock);
        }
        optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
        ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
            (int)optp->level, (int)optp->name, (int)optp->len));
        qreply(q, mpctl);

        if (mp2ctl == NULL) {
                /* Copymsg failed above */
                return (NULL);
        }

        /* Now for IPv6 */
        mpctl = mp2ctl;
        mp_tail = NULL;
        mp2ctl = copymsg(mpctl);
        optp = (struct opthdr *)&mpctl->b_rptr[
            sizeof (struct T_optmgmt_ack)];
        optp->level = MIB2_IP6;
        optp->name = EXPER_IP_DCE;

        /* Do not let the last IPv4 address show up in the IPv6 records. */
        bzero(&dest_cache, sizeof (dest_cache));

        for (i = 0; i < ipst->ips_dce_hashsize; i++) {
                dcb = &ipst->ips_dce_hash_v6[i];
                rw_enter(&dcb->dcb_lock, RW_READER);
                for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) {
                        dest_cache.DestIpv6Address = dce->dce_v6addr;
                        dest_cache.DestFlags = dce->dce_flags;
                        /* dce_pmtu is only meaningful with DCEF_PMTU set */
                        if (dce->dce_flags & DCEF_PMTU)
                                dest_cache.DestPmtu = dce->dce_pmtu;
                        else
                                dest_cache.DestPmtu = 0;
                        dest_cache.DestIdent = dce->dce_ident;
                        /* The ifindex is only reported for link-scope */
                        if (IN6_IS_ADDR_LINKSCOPE(&dce->dce_v6addr))
                                dest_cache.DestIfindex = dce->dce_ifindex;
                        else
                                dest_cache.DestIfindex = 0;
                        dest_cache.DestAge = current_time -
                            dce->dce_last_change_time;
                        /* Best effort: a failed append only loses the entry */
                        if (!snmp_append_data2(mpctl->b_cont, &mp_tail,
                            (char *)&dest_cache, (int)sizeof (dest_cache))) {
                                ip1dbg(("ip_snmp_get_mib2_ip_dce: "
                                    "failed to allocate %u bytes\n",
                                    (uint_t)sizeof (dest_cache)));
                        }
                }
                rw_exit(&dcb->dcb_lock);
        }
        optp->len = (t_uscalar_t)msgdsize(mpctl->b_cont);
        ip3dbg(("ip_snmp_get: level %d, name %d, len %d\n",
            (int)optp->level, (int)optp->name, (int)optp->len));
        qreply(q, mpctl);

        return (mp2ctl);
}

/*
 * Remove the IPv6 DCEs that refer to an ifindex which is going away, by
 * cleaning every IPv6 bucket with that ifindex as the filter.
 * This is not required for correctness, but it avoids netstat -d
 * showing stale stuff that will never be used.
 */
void
dce_cleanup(uint_t ifindex, ip_stack_t *ipst)
{
        uint_t  bucket = 0;

        while (bucket < ipst->ips_dce_hashsize) {
                dce_bucket_clean(&ipst->ips_dce_hash_v6[bucket], B_TRUE,
                    ifindex);
                bucket++;
        }
}