root/usr/src/uts/common/inet/ip/ip_attr.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/* Copyright (c) 1990 Mentat Inc. */

/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/zone.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/atomic.h>

#include <sys/systm.h>
#include <sys/param.h>
#include <sys/kmem.h>
#include <sys/sdt.h>
#include <sys/socket.h>
#include <sys/mac.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <net/route.h>
#include <sys/sockio.h>
#include <netinet/in.h>
#include <net/if_dl.h>

#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/arp.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>

#include <netinet/igmp_var.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <netinet/sctp.h>

#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip6_asp.h>
#include <inet/tcp.h>
#include <inet/ip_multi.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/optcom.h>
#include <inet/ip_ndp.h>
#include <inet/ip_listutils.h>
#include <netinet/igmp.h>
#include <netinet/ip_mroute.h>
#include <inet/ipp_common.h>

#include <net/pfkeyv2.h>
#include <inet/sadb.h>
#include <inet/ipsec_impl.h>
#include <inet/ipdrop.h>
#include <inet/ip_netinfo.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>

#include <inet/ipclassifier.h>
#include <inet/sctp_ip.h>
#include <inet/sctp/sctp_impl.h>
#include <inet/udp_impl.h>
#include <sys/sunddi.h>

#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

/*
 * Release a reference on ip_xmit_attr.
 * The reference is acquired by conn_get_ixa()
 *
 * This macro has a lowercase function-call version for callers outside
 * this file.
 */
#define IXA_REFRELE(ixa)                                        \
{                                                               \
        if (atomic_dec_32_nv(&(ixa)->ixa_refcnt) == 0)  \
                ixa_inactive(ixa);                              \
}

#define IXA_REFHOLD(ixa)                                        \
{                                                               \
        ASSERT3U((ixa)->ixa_refcnt, !=, 0);                     \
        atomic_inc_32(&(ixa)->ixa_refcnt);                      \
}

/*
 * When we need to handle a transmit side asynchronous operation, then we need
 * to save sufficient information so that we can call the fragment and postfrag
 * functions. That information is captured in an mblk containing this structure.
 *
 * Since this is currently only used for IPsec, we include information for
 * the kernel crypto framework.
 */
typedef struct ixamblk_s {
        boolean_t       ixm_inbound;    /* B_FALSE */
        iaflags_t       ixm_flags;      /* ixa_flags */
        netstackid_t    ixm_stackid;    /* Verify it didn't go away */
        uint_t          ixm_ifindex;    /* Used to find the nce */
        in6_addr_t      ixm_nceaddr_v6; /* Used to find nce */
#define ixm_nceaddr_v4  V4_PART_OF_V6(ixm_nceaddr_v6)
        uint32_t        ixm_fragsize;
        uint_t          ixm_pktlen;
        uint16_t        ixm_ip_hdr_length; /* Points to ULP header */
        uint8_t         ixm_protocol;   /* Protocol number for ULP cksum */
        pfirepostfrag_t ixm_postfragfn;

        zoneid_t        ixm_zoneid;             /* Needed for ipobs */
        zoneid_t        ixm_no_loop_zoneid;     /* IXAF_NO_LOOP_ZONEID_SET */

        uint_t          ixm_scopeid;            /* For IPv6 link-locals */

        uint32_t        ixm_ident;              /* For IPv6 fragment header */
        uint32_t        ixm_xmit_hint;

        uint64_t        ixm_conn_id;            /* Used by DTrace */
        cred_t          *ixm_cred;      /* For getpeerucred - refhold if set */
        pid_t           ixm_cpid;       /* For getpeerucred */

        ts_label_t      *ixm_tsl;       /* Refhold if set. */

        /*
         * When the pointers below are set they have a refhold on the struct.
         */
        ipsec_latch_t           *ixm_ipsec_latch;
        struct ipsa_s           *ixm_ipsec_ah_sa;       /* SA for AH */
        struct ipsa_s           *ixm_ipsec_esp_sa;      /* SA for ESP */
        struct ipsec_policy_s   *ixm_ipsec_policy;      /* why are we here? */
        struct ipsec_action_s   *ixm_ipsec_action; /* For reflected packets */

        ipsa_ref_t              ixm_ipsec_ref[2]; /* Soft reference to SA */

        /* Need these while waiting for SA */
        uint16_t ixm_ipsec_src_port;    /* Source port number of d-gram. */
        uint16_t ixm_ipsec_dst_port;    /* Destination port number of d-gram. */
        uint8_t  ixm_ipsec_icmp_type;   /* ICMP type of d-gram */
        uint8_t  ixm_ipsec_icmp_code;   /* ICMP code of d-gram */

        sa_family_t ixm_ipsec_inaf;     /* Inner address family */
        uint32_t ixm_ipsec_insrc[IXA_MAX_ADDRLEN];      /* Inner src address */
        uint32_t ixm_ipsec_indst[IXA_MAX_ADDRLEN];      /* Inner dest address */
        uint8_t  ixm_ipsec_insrcpfx;    /* Inner source prefix */
        uint8_t  ixm_ipsec_indstpfx;    /* Inner destination prefix */

        uint8_t ixm_ipsec_proto;        /* IP protocol number for d-gram. */
} ixamblk_t;


/*
 * When we need to handle a receive side asynchronous operation, then we need
 * to save sufficient information so that we can call ip_fanout.
 * That information is captured in an mblk containing this structure.
 *
 * Since this is currently only used for IPsec, we include information for
 * the kernel crypto framework.
 */
typedef struct iramblk_s {
        boolean_t       irm_inbound;    /* B_TRUE */
        iaflags_t       irm_flags;      /* ira_flags */
        netstackid_t    irm_stackid;    /* Verify it didn't go away */
        uint_t          irm_ifindex;    /* To find ira_ill */

        uint_t          irm_rifindex;   /* ira_rifindex */
        uint_t          irm_ruifindex;  /* ira_ruifindex */
        uint_t          irm_pktlen;
        uint16_t        irm_ip_hdr_length; /* Points to ULP header */
        uint8_t         irm_protocol;   /* Protocol number for ULP cksum */
        uint8_t         irm_ttl;        /* IP TTL, IPv6 hop limit */
        zoneid_t        irm_zoneid;     /* ALL_ZONES unless local delivery */

        squeue_t        *irm_sqp;
        ill_rx_ring_t   *irm_ring;

        ipaddr_t        irm_mroute_tunnel;      /* IRAF_MROUTE_TUNNEL_SET */
        zoneid_t        irm_no_loop_zoneid;     /* IRAF_NO_LOOP_ZONEID_SET */
        uint32_t        irm_esp_udp_ports;      /* IRAF_ESP_UDP_PORTS */

        char            irm_l2src[IRA_L2SRC_SIZE];      /* If IRAF_L2SRC_SET */

        cred_t          *irm_cred;      /* For getpeerucred - refhold if set */
        pid_t           irm_cpid;       /* For getpeerucred */

        ts_label_t      *irm_tsl;       /* Refhold if set. */

        /*
         * When set these correspond to a refhold on the object.
         */
        struct ipsa_s           *irm_ipsec_ah_sa;       /* SA for AH */
        struct ipsa_s           *irm_ipsec_esp_sa;      /* SA for ESP */
        struct ipsec_action_s   *irm_ipsec_action; /* For reflected packets */
} iramblk_t;


/*
 * Take the information in ip_xmit_attr_t and stick it in an mblk
 * that can later be passed to ip_xmit_attr_from_mblk to recreate the
 * ip_xmit_attr_t.
 *
 * Returns NULL on memory allocation failure.
 */
mblk_t *
ip_xmit_attr_to_mblk(ip_xmit_attr_t *ixa)
{
        mblk_t          *ixamp;
        ixamblk_t       *ixm;
        nce_t           *nce = ixa->ixa_nce;

        ASSERT(nce != NULL);
        ixamp = allocb(sizeof (*ixm), BPRI_MED);
        if (ixamp == NULL)
                return (NULL);

        ixamp->b_datap->db_type = M_BREAK;
        ixamp->b_wptr += sizeof (*ixm);
        ixm = (ixamblk_t *)ixamp->b_rptr;

        bzero(ixm, sizeof (*ixm));
        ixm->ixm_inbound = B_FALSE;
        ixm->ixm_flags = ixa->ixa_flags;
        ixm->ixm_stackid = ixa->ixa_ipst->ips_netstack->netstack_stackid;
        ixm->ixm_ifindex = nce->nce_ill->ill_phyint->phyint_ifindex;
        ixm->ixm_nceaddr_v6 = nce->nce_addr;
        ixm->ixm_fragsize = ixa->ixa_fragsize;
        ixm->ixm_pktlen = ixa->ixa_pktlen;
        ixm->ixm_ip_hdr_length = ixa->ixa_ip_hdr_length;
        ixm->ixm_protocol = ixa->ixa_protocol;
        ixm->ixm_postfragfn = ixa->ixa_postfragfn;
        ixm->ixm_zoneid = ixa->ixa_zoneid;
        ixm->ixm_no_loop_zoneid = ixa->ixa_no_loop_zoneid;
        ixm->ixm_scopeid = ixa->ixa_scopeid;
        ixm->ixm_ident = ixa->ixa_ident;
        ixm->ixm_xmit_hint = ixa->ixa_xmit_hint;

        if (ixa->ixa_tsl != NULL) {
                ixm->ixm_tsl = ixa->ixa_tsl;
                label_hold(ixm->ixm_tsl);
        }
        if (ixa->ixa_cred != NULL) {
                ixm->ixm_cred = ixa->ixa_cred;
                crhold(ixa->ixa_cred);
        }
        ixm->ixm_cpid = ixa->ixa_cpid;
        ixm->ixm_conn_id = ixa->ixa_conn_id;

        if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
                if (ixa->ixa_ipsec_ah_sa != NULL) {
                        ixm->ixm_ipsec_ah_sa = ixa->ixa_ipsec_ah_sa;
                        IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
                }
                if (ixa->ixa_ipsec_esp_sa != NULL) {
                        ixm->ixm_ipsec_esp_sa = ixa->ixa_ipsec_esp_sa;
                        IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
                }
                if (ixa->ixa_ipsec_policy != NULL) {
                        ixm->ixm_ipsec_policy = ixa->ixa_ipsec_policy;
                        IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
                }
                if (ixa->ixa_ipsec_action != NULL) {
                        ixm->ixm_ipsec_action = ixa->ixa_ipsec_action;
                        IPACT_REFHOLD(ixa->ixa_ipsec_action);
                }
                if (ixa->ixa_ipsec_latch != NULL) {
                        ixm->ixm_ipsec_latch = ixa->ixa_ipsec_latch;
                        IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
                }
                ixm->ixm_ipsec_ref[0] = ixa->ixa_ipsec_ref[0];
                ixm->ixm_ipsec_ref[1] = ixa->ixa_ipsec_ref[1];
                ixm->ixm_ipsec_src_port = ixa->ixa_ipsec_src_port;
                ixm->ixm_ipsec_dst_port = ixa->ixa_ipsec_dst_port;
                ixm->ixm_ipsec_icmp_type = ixa->ixa_ipsec_icmp_type;
                ixm->ixm_ipsec_icmp_code = ixa->ixa_ipsec_icmp_code;
                ixm->ixm_ipsec_inaf = ixa->ixa_ipsec_inaf;
                ixm->ixm_ipsec_insrc[0] = ixa->ixa_ipsec_insrc[0];
                ixm->ixm_ipsec_insrc[1] = ixa->ixa_ipsec_insrc[1];
                ixm->ixm_ipsec_insrc[2] = ixa->ixa_ipsec_insrc[2];
                ixm->ixm_ipsec_insrc[3] = ixa->ixa_ipsec_insrc[3];
                ixm->ixm_ipsec_indst[0] = ixa->ixa_ipsec_indst[0];
                ixm->ixm_ipsec_indst[1] = ixa->ixa_ipsec_indst[1];
                ixm->ixm_ipsec_indst[2] = ixa->ixa_ipsec_indst[2];
                ixm->ixm_ipsec_indst[3] = ixa->ixa_ipsec_indst[3];
                ixm->ixm_ipsec_insrcpfx = ixa->ixa_ipsec_insrcpfx;
                ixm->ixm_ipsec_indstpfx = ixa->ixa_ipsec_indstpfx;
                ixm->ixm_ipsec_proto = ixa->ixa_ipsec_proto;
        }
        return (ixamp);
}

/*
 * Extract the ip_xmit_attr_t from the mblk, checking that the
 * ip_stack_t, ill_t, and nce_t still exist. Returns B_FALSE if that is
 * not the case.
 *
 * Otherwise ixa is updated.
 * Caller needs to release references on the ixa by calling ixa_refrele()
 * which will imediately call ixa_inactive to release the references.
 */
boolean_t
ip_xmit_attr_from_mblk(mblk_t *ixamp, ip_xmit_attr_t *ixa)
{
        ixamblk_t       *ixm;
        netstack_t      *ns;
        ip_stack_t      *ipst;
        ill_t           *ill;
        nce_t           *nce;

        /* We assume the caller hasn't initialized ixa */
        bzero(ixa, sizeof (*ixa));

        ASSERT(DB_TYPE(ixamp) == M_BREAK);
        ASSERT(ixamp->b_cont == NULL);

        ixm = (ixamblk_t *)ixamp->b_rptr;
        ASSERT(!ixm->ixm_inbound);

        /* Verify the netstack is still around */
        ns = netstack_find_by_stackid(ixm->ixm_stackid);
        if (ns == NULL) {
                /* Disappeared on us */
                (void) ip_xmit_attr_free_mblk(ixamp);
                return (B_FALSE);
        }
        ipst = ns->netstack_ip;

        /* Verify the ill is still around */
        ill = ill_lookup_on_ifindex(ixm->ixm_ifindex,
            !(ixm->ixm_flags & IXAF_IS_IPV4), ipst);

        /* We have the ill, hence the netstack can't go away */
        netstack_rele(ns);
        if (ill == NULL) {
                /* Disappeared on us */
                (void) ip_xmit_attr_free_mblk(ixamp);
                return (B_FALSE);
        }
        /*
         * Find the nce. We don't load-spread (only lookup nce's on the ill)
         * because we want to find the same nce as the one we had when
         * ip_xmit_attr_to_mblk was called.
         */
        if (ixm->ixm_flags & IXAF_IS_IPV4) {
                nce = nce_lookup_v4(ill, &ixm->ixm_nceaddr_v4);
        } else {
                nce = nce_lookup_v6(ill, &ixm->ixm_nceaddr_v6);
        }

        /* We have the nce, hence the ill can't go away */
        ill_refrele(ill);
        if (nce == NULL) {
                /*
                 * Since this is unusual and we don't know what type of
                 * nce it was, we drop the packet.
                 */
                (void) ip_xmit_attr_free_mblk(ixamp);
                return (B_FALSE);
        }

        ixa->ixa_flags = ixm->ixm_flags;
        ixa->ixa_refcnt = 1;
        ixa->ixa_ipst = ipst;
        ixa->ixa_fragsize = ixm->ixm_fragsize;
        ixa->ixa_pktlen =  ixm->ixm_pktlen;
        ixa->ixa_ip_hdr_length = ixm->ixm_ip_hdr_length;
        ixa->ixa_protocol = ixm->ixm_protocol;
        ixa->ixa_nce = nce;
        ixa->ixa_postfragfn = ixm->ixm_postfragfn;
        ixa->ixa_zoneid = ixm->ixm_zoneid;
        ixa->ixa_no_loop_zoneid = ixm->ixm_no_loop_zoneid;
        ixa->ixa_scopeid = ixm->ixm_scopeid;
        ixa->ixa_ident = ixm->ixm_ident;
        ixa->ixa_xmit_hint = ixm->ixm_xmit_hint;

        if (ixm->ixm_tsl != NULL) {
                ixa->ixa_tsl = ixm->ixm_tsl;
                ixa->ixa_free_flags |= IXA_FREE_TSL;
                ixm->ixm_tsl = NULL;
        }
        if (ixm->ixm_cred != NULL) {
                ixa->ixa_cred = ixm->ixm_cred;
                ixa->ixa_free_flags |= IXA_FREE_CRED;
                ixm->ixm_cred = NULL;
        }
        ixa->ixa_cpid = ixm->ixm_cpid;
        ixa->ixa_conn_id = ixm->ixm_conn_id;

        ixa->ixa_ipsec_ah_sa = ixm->ixm_ipsec_ah_sa;
        ixa->ixa_ipsec_esp_sa = ixm->ixm_ipsec_esp_sa;
        ixa->ixa_ipsec_policy = ixm->ixm_ipsec_policy;
        ixa->ixa_ipsec_action = ixm->ixm_ipsec_action;
        ixa->ixa_ipsec_latch = ixm->ixm_ipsec_latch;

        ixa->ixa_ipsec_ref[0] = ixm->ixm_ipsec_ref[0];
        ixa->ixa_ipsec_ref[1] = ixm->ixm_ipsec_ref[1];
        ixa->ixa_ipsec_src_port = ixm->ixm_ipsec_src_port;
        ixa->ixa_ipsec_dst_port = ixm->ixm_ipsec_dst_port;
        ixa->ixa_ipsec_icmp_type = ixm->ixm_ipsec_icmp_type;
        ixa->ixa_ipsec_icmp_code = ixm->ixm_ipsec_icmp_code;
        ixa->ixa_ipsec_inaf = ixm->ixm_ipsec_inaf;
        ixa->ixa_ipsec_insrc[0] = ixm->ixm_ipsec_insrc[0];
        ixa->ixa_ipsec_insrc[1] = ixm->ixm_ipsec_insrc[1];
        ixa->ixa_ipsec_insrc[2] = ixm->ixm_ipsec_insrc[2];
        ixa->ixa_ipsec_insrc[3] = ixm->ixm_ipsec_insrc[3];
        ixa->ixa_ipsec_indst[0] = ixm->ixm_ipsec_indst[0];
        ixa->ixa_ipsec_indst[1] = ixm->ixm_ipsec_indst[1];
        ixa->ixa_ipsec_indst[2] = ixm->ixm_ipsec_indst[2];
        ixa->ixa_ipsec_indst[3] = ixm->ixm_ipsec_indst[3];
        ixa->ixa_ipsec_insrcpfx = ixm->ixm_ipsec_insrcpfx;
        ixa->ixa_ipsec_indstpfx = ixm->ixm_ipsec_indstpfx;
        ixa->ixa_ipsec_proto = ixm->ixm_ipsec_proto;

        freeb(ixamp);
        return (B_TRUE);
}

/*
 * Free the ixm mblk and any references it holds
 * Returns b_cont.
 */
mblk_t *
ip_xmit_attr_free_mblk(mblk_t *ixamp)
{
        ixamblk_t       *ixm;
        mblk_t          *mp;

        /* Consume mp */
        ASSERT(DB_TYPE(ixamp) == M_BREAK);
        mp = ixamp->b_cont;

        ixm = (ixamblk_t *)ixamp->b_rptr;
        ASSERT(!ixm->ixm_inbound);

        if (ixm->ixm_ipsec_ah_sa != NULL) {
                IPSA_REFRELE(ixm->ixm_ipsec_ah_sa);
                ixm->ixm_ipsec_ah_sa = NULL;
        }
        if (ixm->ixm_ipsec_esp_sa != NULL) {
                IPSA_REFRELE(ixm->ixm_ipsec_esp_sa);
                ixm->ixm_ipsec_esp_sa = NULL;
        }
        if (ixm->ixm_ipsec_policy != NULL) {
                IPPOL_REFRELE(ixm->ixm_ipsec_policy);
                ixm->ixm_ipsec_policy = NULL;
        }
        if (ixm->ixm_ipsec_action != NULL) {
                IPACT_REFRELE(ixm->ixm_ipsec_action);
                ixm->ixm_ipsec_action = NULL;
        }
        if (ixm->ixm_ipsec_latch) {
                IPLATCH_REFRELE(ixm->ixm_ipsec_latch);
                ixm->ixm_ipsec_latch = NULL;
        }

        if (ixm->ixm_tsl != NULL) {
                label_rele(ixm->ixm_tsl);
                ixm->ixm_tsl = NULL;
        }
        if (ixm->ixm_cred != NULL) {
                crfree(ixm->ixm_cred);
                ixm->ixm_cred = NULL;
        }
        freeb(ixamp);
        return (mp);
}

/*
 * Take the information in ip_recv_attr_t and stick it in an mblk
 * that can later be passed to ip_recv_attr_from_mblk to recreate the
 * ip_recv_attr_t.
 *
 * Returns NULL on memory allocation failure.
 */
mblk_t *
ip_recv_attr_to_mblk(ip_recv_attr_t *ira)
{
        mblk_t          *iramp;
        iramblk_t       *irm;
        ill_t           *ill = ira->ira_ill;

        ASSERT(ira->ira_ill != NULL || ira->ira_ruifindex != 0);

        iramp = allocb(sizeof (*irm), BPRI_MED);
        if (iramp == NULL)
                return (NULL);

        iramp->b_datap->db_type = M_BREAK;
        iramp->b_wptr += sizeof (*irm);
        irm = (iramblk_t *)iramp->b_rptr;

        bzero(irm, sizeof (*irm));
        irm->irm_inbound = B_TRUE;
        irm->irm_flags = ira->ira_flags;
        if (ill != NULL) {
                /* Internal to IP - preserve ip_stack_t, ill and rill */
                irm->irm_stackid =
                    ill->ill_ipst->ips_netstack->netstack_stackid;
                irm->irm_ifindex = ira->ira_ill->ill_phyint->phyint_ifindex;
                ASSERT(ira->ira_rill->ill_phyint->phyint_ifindex ==
                    ira->ira_rifindex);
        } else {
                /* Let ip_recv_attr_from_stackid know there isn't one */
                irm->irm_stackid = -1;
        }
        irm->irm_rifindex = ira->ira_rifindex;
        irm->irm_ruifindex = ira->ira_ruifindex;
        irm->irm_pktlen = ira->ira_pktlen;
        irm->irm_ip_hdr_length = ira->ira_ip_hdr_length;
        irm->irm_protocol = ira->ira_protocol;
        irm->irm_ttl = ira->ira_ttl;

        irm->irm_sqp = ira->ira_sqp;
        irm->irm_ring = ira->ira_ring;

        irm->irm_zoneid = ira->ira_zoneid;
        irm->irm_mroute_tunnel = ira->ira_mroute_tunnel;
        irm->irm_no_loop_zoneid = ira->ira_no_loop_zoneid;
        irm->irm_esp_udp_ports = ira->ira_esp_udp_ports;

        if (ira->ira_tsl != NULL) {
                irm->irm_tsl = ira->ira_tsl;
                label_hold(irm->irm_tsl);
        }
        if (ira->ira_cred != NULL) {
                irm->irm_cred = ira->ira_cred;
                crhold(ira->ira_cred);
        }
        irm->irm_cpid = ira->ira_cpid;

        if (ira->ira_flags & IRAF_L2SRC_SET)
                bcopy(ira->ira_l2src, irm->irm_l2src, IRA_L2SRC_SIZE);

        if (ira->ira_flags & IRAF_IPSEC_SECURE) {
                if (ira->ira_ipsec_ah_sa != NULL) {
                        irm->irm_ipsec_ah_sa = ira->ira_ipsec_ah_sa;
                        IPSA_REFHOLD(ira->ira_ipsec_ah_sa);
                }
                if (ira->ira_ipsec_esp_sa != NULL) {
                        irm->irm_ipsec_esp_sa = ira->ira_ipsec_esp_sa;
                        IPSA_REFHOLD(ira->ira_ipsec_esp_sa);
                }
                if (ira->ira_ipsec_action != NULL) {
                        irm->irm_ipsec_action = ira->ira_ipsec_action;
                        IPACT_REFHOLD(ira->ira_ipsec_action);
                }
        }
        return (iramp);
}

/*
 * Extract the ip_recv_attr_t from the mblk. If we are used inside IP
 * then irm_stackid is not -1, in which case we check that the
 * ip_stack_t and ill_t still exist. Returns B_FALSE if that is
 * not the case.
 * If irm_stackid is zero then we are used by an ULP (e.g., squeue_enter)
 * and we just proceed with ira_ill and ira_rill as NULL.
 *
 * The caller needs to release any references on the pointers inside the ire
 * by calling ira_cleanup.
 */
boolean_t
ip_recv_attr_from_mblk(mblk_t *iramp, ip_recv_attr_t *ira)
{
        iramblk_t       *irm;
        netstack_t      *ns;
        ip_stack_t      *ipst = NULL;
        ill_t           *ill = NULL, *rill = NULL;

        /* We assume the caller hasn't initialized ira */
        bzero(ira, sizeof (*ira));

        ASSERT(DB_TYPE(iramp) == M_BREAK);
        ASSERT(iramp->b_cont == NULL);

        irm = (iramblk_t *)iramp->b_rptr;
        ASSERT(irm->irm_inbound);

        if (irm->irm_stackid != -1) {
                /* Verify the netstack is still around */
                ns = netstack_find_by_stackid(irm->irm_stackid);
                if (ns == NULL) {
                        /* Disappeared on us */
                        (void) ip_recv_attr_free_mblk(iramp);
                        return (B_FALSE);
                }
                ipst = ns->netstack_ip;

                /* Verify the ill is still around */
                ill = ill_lookup_on_ifindex(irm->irm_ifindex,
                    !(irm->irm_flags & IRAF_IS_IPV4), ipst);

                if (irm->irm_ifindex == irm->irm_rifindex) {
                        rill = ill;
                } else {
                        rill = ill_lookup_on_ifindex(irm->irm_rifindex,
                            !(irm->irm_flags & IRAF_IS_IPV4), ipst);
                }

                /* We have the ill, hence the netstack can't go away */
                netstack_rele(ns);
                if (ill == NULL || rill == NULL) {
                        /* Disappeared on us */
                        if (ill != NULL)
                                ill_refrele(ill);
                        if (rill != NULL && rill != ill)
                                ill_refrele(rill);
                        (void) ip_recv_attr_free_mblk(iramp);
                        return (B_FALSE);
                }
        }

        ira->ira_flags = irm->irm_flags;
        /* Caller must ill_refele(ira_ill) by using ira_cleanup() */
        ira->ira_ill = ill;
        ira->ira_rill = rill;

        ira->ira_rifindex = irm->irm_rifindex;
        ira->ira_ruifindex = irm->irm_ruifindex;
        ira->ira_pktlen = irm->irm_pktlen;
        ira->ira_ip_hdr_length = irm->irm_ip_hdr_length;
        ira->ira_protocol = irm->irm_protocol;
        ira->ira_ttl = irm->irm_ttl;

        ira->ira_sqp = irm->irm_sqp;
        /* The rest of IP assumes that the rings never go away. */
        ira->ira_ring = irm->irm_ring;

        ira->ira_zoneid = irm->irm_zoneid;
        ira->ira_mroute_tunnel = irm->irm_mroute_tunnel;
        ira->ira_no_loop_zoneid = irm->irm_no_loop_zoneid;
        ira->ira_esp_udp_ports = irm->irm_esp_udp_ports;

        if (irm->irm_tsl != NULL) {
                ira->ira_tsl = irm->irm_tsl;
                ira->ira_free_flags |= IRA_FREE_TSL;
                irm->irm_tsl = NULL;
        }
        if (irm->irm_cred != NULL) {
                ira->ira_cred = irm->irm_cred;
                ira->ira_free_flags |= IRA_FREE_CRED;
                irm->irm_cred = NULL;
        }
        ira->ira_cpid = irm->irm_cpid;

        if (ira->ira_flags & IRAF_L2SRC_SET)
                bcopy(irm->irm_l2src, ira->ira_l2src, IRA_L2SRC_SIZE);

        ira->ira_ipsec_ah_sa = irm->irm_ipsec_ah_sa;
        ira->ira_ipsec_esp_sa = irm->irm_ipsec_esp_sa;
        ira->ira_ipsec_action = irm->irm_ipsec_action;

        freeb(iramp);
        return (B_TRUE);
}

/*
 * Free the irm mblk and any references it holds
 * Returns b_cont.
 */
mblk_t *
ip_recv_attr_free_mblk(mblk_t *iramp)
{
        iramblk_t       *irm;
        mblk_t          *mp;

        /* Consume mp */
        ASSERT(DB_TYPE(iramp) == M_BREAK);
        mp = iramp->b_cont;

        irm = (iramblk_t *)iramp->b_rptr;
        ASSERT(irm->irm_inbound);

        if (irm->irm_ipsec_ah_sa != NULL) {
                IPSA_REFRELE(irm->irm_ipsec_ah_sa);
                irm->irm_ipsec_ah_sa = NULL;
        }
        if (irm->irm_ipsec_esp_sa != NULL) {
                IPSA_REFRELE(irm->irm_ipsec_esp_sa);
                irm->irm_ipsec_esp_sa = NULL;
        }
        if (irm->irm_ipsec_action != NULL) {
                IPACT_REFRELE(irm->irm_ipsec_action);
                irm->irm_ipsec_action = NULL;
        }
        if (irm->irm_tsl != NULL) {
                label_rele(irm->irm_tsl);
                irm->irm_tsl = NULL;
        }
        if (irm->irm_cred != NULL) {
                crfree(irm->irm_cred);
                irm->irm_cred = NULL;
        }

        freeb(iramp);
        return (mp);
}

/*
 * Returns true if the mblk contains an ip_recv_attr_t
 * For now we just check db_type.
 */
boolean_t
ip_recv_attr_is_mblk(mblk_t *mp)
{
        /*
         * Need to handle the various forms of tcp_timermp which are tagged
         * with b_wptr and might have a NULL b_datap.
         */
        if (mp->b_wptr == NULL || mp->b_wptr == (uchar_t *)-1)
                return (B_FALSE);

#ifdef  DEBUG
        iramblk_t       *irm;

        if (DB_TYPE(mp) != M_BREAK)
                return (B_FALSE);

        irm = (iramblk_t *)mp->b_rptr;
        ASSERT(irm->irm_inbound);
        return (B_TRUE);
#else
        return (DB_TYPE(mp) == M_BREAK);
#endif
}

static ip_xmit_attr_t *
conn_get_ixa_impl(conn_t *connp, boolean_t replace, int kmflag)
{
        ip_xmit_attr_t  *oldixa;        /* Already attached to conn_t */
        ip_xmit_attr_t  *ixa;           /* New one, which we return. */

        /*
         * NOTE: If the marked-below common case isn't, move the
         * kmem_alloc() up here and put a free in what was marked as the
         * (not really) common case instead.
         */

        mutex_enter(&connp->conn_lock);
        oldixa = connp->conn_ixa;

        /* At least one reference for the conn_t */
        ASSERT3U(oldixa->ixa_refcnt, >=, 1);
        if (atomic_inc_32_nv(&oldixa->ixa_refcnt) == 2) {
                /* No other thread using conn_ixa (common case) */
                mutex_exit(&connp->conn_lock);
                return (oldixa);
        }
        /* Do allocation inside-the-conn_lock because it's less common. */
        ixa = kmem_alloc(sizeof (*ixa), kmflag);
        if (ixa == NULL) {
                mutex_exit(&connp->conn_lock);
                IXA_REFRELE(oldixa);
                return (NULL);
        }
        ixa_safe_copy(oldixa, ixa);

        /* Make sure we drop conn_lock before any refrele */
        if (replace) {
                ixa->ixa_refcnt++;      /* No atomic needed - not visible */
                connp->conn_ixa = ixa;
                mutex_exit(&connp->conn_lock);
                IXA_REFRELE(oldixa);    /* Undo refcnt from conn_t */
        } else {
                mutex_exit(&connp->conn_lock);
        }
        IXA_REFRELE(oldixa);    /* Undo above atomic_add_32_nv */

        return (ixa);
}

/*
 * Return an ip_xmit_attr_t to use with a conn_t that ensures that only
 * the caller can access the ip_xmit_attr_t.
 *
 * If nobody else is using conn_ixa we return it.
 * Otherwise we make a "safe" copy of conn_ixa
 * and return it. The "safe" copy has the pointers set to NULL
 * (since the pointers might be changed by another thread using
 * conn_ixa). The caller needs to check for NULL pointers to see
 * if ip_set_destination needs to be called to re-establish the pointers.
 *
 * If 'replace' is set then we replace conn_ixa with the new ip_xmit_attr_t.
 * That is used when we connect() the ULP.
 */
ip_xmit_attr_t *
conn_get_ixa(conn_t *connp, boolean_t replace)
{
        return (conn_get_ixa_impl(connp, replace, KM_NOSLEEP));
}

/*
 * Used only when the option is to have the kernel hang due to not
 * cleaning up ixa references on ills etc.
 */
ip_xmit_attr_t *
conn_get_ixa_tryhard(conn_t *connp, boolean_t replace)
{
        return (conn_get_ixa_impl(connp, replace, KM_SLEEP));
}

/*
 * Replace conn_ixa with the ixa argument.
 *
 * The caller must hold conn_lock.
 *
 * We return the old ixa; the caller must ixa_refrele that after conn_lock
 * has been dropped.
 */
ip_xmit_attr_t *
conn_replace_ixa(conn_t *connp, ip_xmit_attr_t *ixa)
{
        ip_xmit_attr_t  *oldixa;

        ASSERT(MUTEX_HELD(&connp->conn_lock));

        oldixa = connp->conn_ixa;
        IXA_REFHOLD(ixa);
        ixa->ixa_conn_id = oldixa->ixa_conn_id;
        connp->conn_ixa = ixa;
        return (oldixa);
}

/*
 * Return a ip_xmit_attr_t to use with a conn_t that is based on but
 * separate from conn_ixa.
 *
 * This "safe" copy has the pointers set to NULL
 * (since the pointers might be changed by another thread using
 * conn_ixa). The caller needs to check for NULL pointers to see
 * if ip_set_destination needs to be called to re-establish the pointers.
 */
ip_xmit_attr_t *
conn_get_ixa_exclusive(conn_t *connp)
{
        ip_xmit_attr_t *oldixa;
        ip_xmit_attr_t *ixa;

        ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP_LAZY);
        if (ixa == NULL)
                return (NULL);

        mutex_enter(&connp->conn_lock);

        oldixa = connp->conn_ixa;
        IXA_REFHOLD(oldixa);

        ixa_safe_copy(oldixa, ixa);
        mutex_exit(&connp->conn_lock);
        IXA_REFRELE(oldixa);
        return (ixa);
}

void
ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
{
        bcopy(src, ixa, sizeof (*ixa));
        ixa->ixa_refcnt = 1;
        /*
         * Clear any pointers that have references and might be changed
         * by ip_set_destination or the ULP
         */
        ixa->ixa_ire = NULL;
        ixa->ixa_nce = NULL;
        ixa->ixa_dce = NULL;
        ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
        ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
#ifdef DEBUG
        ixa->ixa_curthread = NULL;
#endif
        /* Clear all the IPsec pointers and the flag as well. */
        ixa->ixa_flags &= ~IXAF_IPSEC_SECURE;

        ixa->ixa_ipsec_latch = NULL;
        ixa->ixa_ipsec_ah_sa = NULL;
        ixa->ixa_ipsec_esp_sa = NULL;
        ixa->ixa_ipsec_policy = NULL;
        ixa->ixa_ipsec_action = NULL;

        /*
         * We leave ixa_tsl unchanged, but if it has a refhold we need
         * to get an extra refhold.
         */
        if (ixa->ixa_free_flags & IXA_FREE_TSL)
                label_hold(ixa->ixa_tsl);

        /*
         * We leave ixa_cred unchanged, but if it has a refhold we need
         * to get an extra refhold.
         */
        if (ixa->ixa_free_flags & IXA_FREE_CRED)
                crhold(ixa->ixa_cred);

        /*
         * There is no cleanup in progress on this new copy.
         */
        ixa->ixa_tcpcleanup = IXATC_IDLE;
}

/*
 * Duplicate an ip_xmit_attr_t.
 * Assumes that the caller controls the ixa, hence we do not need to use
 * a safe copy. We just have to increase the refcnt on any pointers.
 */
ip_xmit_attr_t *
ip_xmit_attr_duplicate(ip_xmit_attr_t *src_ixa)
{
        ip_xmit_attr_t *ixa;

        ixa = kmem_alloc(sizeof (*ixa), KM_NOSLEEP);
        if (ixa == NULL)
                return (NULL);
        bcopy(src_ixa, ixa, sizeof (*ixa));
        ixa->ixa_refcnt = 1;

        if (ixa->ixa_ire != NULL)
                ire_refhold_notr(ixa->ixa_ire);
        if (ixa->ixa_nce != NULL)
                nce_refhold(ixa->ixa_nce);
        if (ixa->ixa_dce != NULL)
                dce_refhold_notr(ixa->ixa_dce);

#ifdef DEBUG
        ixa->ixa_curthread = NULL;
#endif

        if (ixa->ixa_ipsec_latch != NULL)
                IPLATCH_REFHOLD(ixa->ixa_ipsec_latch);
        if (ixa->ixa_ipsec_ah_sa != NULL)
                IPSA_REFHOLD(ixa->ixa_ipsec_ah_sa);
        if (ixa->ixa_ipsec_esp_sa != NULL)
                IPSA_REFHOLD(ixa->ixa_ipsec_esp_sa);
        if (ixa->ixa_ipsec_policy != NULL)
                IPPOL_REFHOLD(ixa->ixa_ipsec_policy);
        if (ixa->ixa_ipsec_action != NULL)
                IPACT_REFHOLD(ixa->ixa_ipsec_action);

        if (ixa->ixa_tsl != NULL) {
                label_hold(ixa->ixa_tsl);
                ixa->ixa_free_flags |= IXA_FREE_TSL;
        }
        if (ixa->ixa_cred != NULL) {
                crhold(ixa->ixa_cred);
                ixa->ixa_free_flags |= IXA_FREE_CRED;
        }
        return (ixa);
}

/*
 * Used to replace the ixa_label field.
 * The caller should have a reference on the label, which we transfer to
 * the attributes so that when the attribute is freed/cleaned up
 * we will release that reference.
 */
void
ip_xmit_attr_replace_tsl(ip_xmit_attr_t *ixa, ts_label_t *tsl)
{
        ASSERT(tsl != NULL);

        if (ixa->ixa_free_flags & IXA_FREE_TSL) {
                ASSERT(ixa->ixa_tsl != NULL);
                label_rele(ixa->ixa_tsl);
        } else {
                ixa->ixa_free_flags |= IXA_FREE_TSL;
        }
        ixa->ixa_tsl = tsl;
}

/*
 * Replace the ip_recv_attr_t's label.
 * Due to kernel RPC's use of db_credp we also need to replace ira_cred;
 * TCP/UDP uses ira_cred to set db_credp for non-socket users.
 * This can fail (and return B_FALSE) due to lack of memory.
 */
boolean_t
ip_recv_attr_replace_label(ip_recv_attr_t *ira, ts_label_t *tsl)
{
        cred_t  *newcr;

        if (ira->ira_free_flags & IRA_FREE_TSL) {
                ASSERT(ira->ira_tsl != NULL);
                label_rele(ira->ira_tsl);
        }
        label_hold(tsl);
        ira->ira_tsl = tsl;
        ira->ira_free_flags |= IRA_FREE_TSL;

        /*
         * Reset zoneid if we have a shared address. That allows
         * ip_fanout_tx_v4/v6 to determine the zoneid again.
         */
        if (ira->ira_flags & IRAF_TX_SHARED_ADDR)
                ira->ira_zoneid = ALL_ZONES;

        /* We update ira_cred for RPC */
        newcr = copycred_from_tslabel(ira->ira_cred, ira->ira_tsl, KM_NOSLEEP);
        if (newcr == NULL)
                return (B_FALSE);
        if (ira->ira_free_flags & IRA_FREE_CRED)
                crfree(ira->ira_cred);
        ira->ira_cred = newcr;
        ira->ira_free_flags |= IRA_FREE_CRED;
        return (B_TRUE);
}

/*
 * This needs to be called after ip_set_destination/tsol_check_dest might
 * have changed ixa_tsl to be specific for a destination, and we now want to
 * send to a different destination.
 * We have to restart with crgetlabel() since ip_set_destination/
 * tsol_check_dest will start with ixa_tsl.
 */
void
ip_xmit_attr_restore_tsl(ip_xmit_attr_t *ixa, cred_t *cr)
{
        if (!is_system_labeled())
                return;

        if (ixa->ixa_free_flags & IXA_FREE_TSL) {
                ASSERT(ixa->ixa_tsl != NULL);
                label_rele(ixa->ixa_tsl);
                ixa->ixa_free_flags &= ~IXA_FREE_TSL;
        }
        ixa->ixa_tsl = crgetlabel(cr);
}

void
ixa_refrele(ip_xmit_attr_t *ixa)
{
        IXA_REFRELE(ixa);
}

void
ixa_inactive(ip_xmit_attr_t *ixa)
{
        ASSERT(ixa->ixa_refcnt == 0);

        ixa_cleanup(ixa);
        kmem_free(ixa, sizeof (*ixa));
}

/*
 * Release any references contained in the ixa.
 * Also clear any fields that are not controlled by ixa_flags.
 */
void
ixa_cleanup(ip_xmit_attr_t *ixa)
{
        if (ixa->ixa_ire != NULL) {
                ire_refrele_notr(ixa->ixa_ire);
                ixa->ixa_ire = NULL;
        }
        if (ixa->ixa_dce != NULL) {
                dce_refrele_notr(ixa->ixa_dce);
                ixa->ixa_dce = NULL;
        }
        if (ixa->ixa_nce != NULL) {
                nce_refrele(ixa->ixa_nce);
                ixa->ixa_nce = NULL;
        }
        ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
        ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
        if (ixa->ixa_flags & IXAF_IPSEC_SECURE) {
                ipsec_out_release_refs(ixa);
        }
        if (ixa->ixa_free_flags & IXA_FREE_TSL) {
                ASSERT(ixa->ixa_tsl != NULL);
                label_rele(ixa->ixa_tsl);
                ixa->ixa_free_flags &= ~IXA_FREE_TSL;
        }
        ixa->ixa_tsl = NULL;
        if (ixa->ixa_free_flags & IXA_FREE_CRED) {
                ASSERT(ixa->ixa_cred != NULL);
                crfree(ixa->ixa_cred);
                ixa->ixa_free_flags &= ~IXA_FREE_CRED;
        }
        ixa->ixa_cred = NULL;
        ixa->ixa_src_preferences = 0;
        ixa->ixa_ifindex = 0;
        ixa->ixa_multicast_ifindex = 0;
        ixa->ixa_multicast_ifaddr = INADDR_ANY;
}

/*
 * Release any references contained in the ira.
 * Callers which use ip_recv_attr_from_mblk() would pass B_TRUE as the second
 * argument.
 */
void
ira_cleanup(ip_recv_attr_t *ira, boolean_t refrele_ill)
{
        if (ira->ira_ill != NULL) {
                if (ira->ira_rill != ira->ira_ill) {
                        /* Caused by async processing */
                        ill_refrele(ira->ira_rill);
                }
                if (refrele_ill)
                        ill_refrele(ira->ira_ill);
        }
        if (ira->ira_flags & IRAF_IPSEC_SECURE) {
                ipsec_in_release_refs(ira);
        }
        if (ira->ira_free_flags & IRA_FREE_TSL) {
                ASSERT(ira->ira_tsl != NULL);
                label_rele(ira->ira_tsl);
                ira->ira_free_flags &= ~IRA_FREE_TSL;
        }
        ira->ira_tsl = NULL;
        if (ira->ira_free_flags & IRA_FREE_CRED) {
                ASSERT(ira->ira_cred != NULL);
                crfree(ira->ira_cred);
                ira->ira_free_flags &= ~IRA_FREE_CRED;
        }
        ira->ira_cred = NULL;
}

/*
 * Function to help release any IRE, NCE, or DCEs that
 * have been deleted and are marked as condemned.
 * The caller is responsible for any serialization which is different
 * for TCP, SCTP, and others.
 */
static void
ixa_cleanup_stale(ip_xmit_attr_t *ixa)
{
        ire_t           *ire;
        nce_t           *nce;
        dce_t           *dce;

        ire = ixa->ixa_ire;
        nce = ixa->ixa_nce;
        dce = ixa->ixa_dce;

        if (ire != NULL && IRE_IS_CONDEMNED(ire)) {
                ire_refrele_notr(ire);
                ire = ire_blackhole(ixa->ixa_ipst,
                    !(ixa->ixa_flags & IXAF_IS_IPV4));
                ASSERT(ire != NULL);
#ifdef DEBUG
                ire_refhold_notr(ire);
                ire_refrele(ire);
#endif
                ixa->ixa_ire = ire;
                ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
        }
        if (nce != NULL && nce->nce_is_condemned) {
                /* Can make it NULL as long as we set IRE_GENERATION_VERIFY */
                nce_refrele(nce);
                ixa->ixa_nce = NULL;
                ixa->ixa_ire_generation = IRE_GENERATION_VERIFY;
        }
        if (dce != NULL && DCE_IS_CONDEMNED(dce)) {
                dce_refrele_notr(dce);
                dce = dce_get_default(ixa->ixa_ipst);
                ASSERT(dce != NULL);
#ifdef DEBUG
                dce_refhold_notr(dce);
                dce_refrele(dce);
#endif
                ixa->ixa_dce = dce;
                ixa->ixa_dce_generation = DCE_GENERATION_VERIFY;
        }
}

static mblk_t *
tcp_ixa_cleanup_getmblk(conn_t *connp)
{
        tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;
        int need_retry;
        mblk_t *mp;

        mutex_enter(&tcps->tcps_ixa_cleanup_lock);

        /*
         * It's possible that someone else came in and started cleaning up
         * another connection between the time we verified this one is not being
         * cleaned up and the time we actually get the shared mblk.  If that's
         * the case, we've dropped the lock, and some other thread may have
         * cleaned up this connection again, and is still waiting for
         * notification of that cleanup's completion.  Therefore we need to
         * recheck.
         */
        do {
                need_retry = 0;
                while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) {
                        cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
                            &tcps->tcps_ixa_cleanup_lock);
                }

                while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) {
                        /*
                         * Multiple concurrent cleanups; need to have the last
                         * one run since it could be an unplumb.
                         */
                        need_retry = 1;
                        cv_wait(&tcps->tcps_ixa_cleanup_ready_cv,
                            &tcps->tcps_ixa_cleanup_lock);
                }
        } while (need_retry);

        /*
         * We now have the lock and the mblk; now make sure that no one else can
         * try to clean up this connection or enqueue it for cleanup, clear the
         * mblk pointer for this stack, drop the lock, and return the mblk.
         */
        ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock));
        ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE);
        ASSERT(tcps->tcps_ixa_cleanup_mp == mp);
        ASSERT(mp != NULL);

        connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS;
        tcps->tcps_ixa_cleanup_mp = NULL;
        mutex_exit(&tcps->tcps_ixa_cleanup_lock);

        return (mp);
}

/*
 * Used to run ixa_cleanup_stale inside the tcp squeue.
 * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp
 * and waking up the caller.
 */
/* ARGSUSED2 */
static void
tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2,
    ip_recv_attr_t *dummy)
{
        conn_t  *connp = (conn_t *)arg;
        tcp_stack_t     *tcps;

        tcps = connp->conn_netstack->netstack_tcp;

        ixa_cleanup_stale(connp->conn_ixa);

        mutex_enter(&tcps->tcps_ixa_cleanup_lock);
        ASSERT(tcps->tcps_ixa_cleanup_mp == NULL);
        connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE;
        tcps->tcps_ixa_cleanup_mp = mp;
        cv_signal(&tcps->tcps_ixa_cleanup_ready_cv);
        /*
         * It is possible for any number of threads to be waiting for cleanup of
         * different connections.  Absent a per-connection (or per-IXA) CV, we
         * need to wake them all up even though only one can be waiting on this
         * particular cleanup.
         */
        cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);
        mutex_exit(&tcps->tcps_ixa_cleanup_lock);
}

static void
tcp_ixa_cleanup_wait_and_finish(conn_t *connp)
{
        tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp;

        mutex_enter(&tcps->tcps_ixa_cleanup_lock);

        ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE);

        while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) {
                cv_wait(&tcps->tcps_ixa_cleanup_done_cv,
                    &tcps->tcps_ixa_cleanup_lock);
        }

        ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE);
        connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE;
        cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv);

        mutex_exit(&tcps->tcps_ixa_cleanup_lock);
}

/*
 * ipcl_walk() function to help release any IRE, NCE, or DCEs that
 * have been deleted and are marked as condemned.
 * Note that we can't cleanup the pointers since there can be threads
 * in conn_ip_output() sending while we are called.
 */
void
conn_ixa_cleanup(conn_t *connp, void *arg)
{
        boolean_t tryhard = (boolean_t)arg;

        if (IPCL_IS_TCP(connp)) {
                mblk_t          *mp;

                mp = tcp_ixa_cleanup_getmblk(connp);

                if (connp->conn_sqp->sq_run == curthread) {
                        /* Already on squeue */
                        tcp_ixa_cleanup(connp, mp, NULL, NULL);
                } else {
                        CONN_INC_REF(connp);
                        SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup,
                            connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP);
                }
                tcp_ixa_cleanup_wait_and_finish(connp);
        } else if (IPCL_IS_SCTP(connp)) {
                sctp_t  *sctp;
                sctp_faddr_t *fp;

                sctp = CONN2SCTP(connp);
                RUN_SCTP(sctp);
                ixa_cleanup_stale(connp->conn_ixa);
                for (fp = sctp->sctp_faddrs; fp != NULL; fp = fp->sf_next)
                        ixa_cleanup_stale(fp->sf_ixa);
                WAKE_SCTP(sctp);
        } else {
                ip_xmit_attr_t  *ixa;

                /*
                 * If there is a different thread using conn_ixa then we get a
                 * new copy and cut the old one loose from conn_ixa. Otherwise
                 * we use conn_ixa and prevent any other thread from
                 * using/changing it. Anybody using conn_ixa (e.g., a thread in
                 * conn_ip_output) will do an ixa_refrele which will remove any
                 * references on the ire etc.
                 *
                 * Once we are done other threads can use conn_ixa since the
                 * refcnt will be back at one.
                 *
                 * We are called either because an ill is going away, or
                 * due to memory reclaim. In the former case we wait for
                 * memory since we must remove the refcnts on the ill.
                 */
                if (tryhard) {
                        ixa = conn_get_ixa_tryhard(connp, B_TRUE);
                        ASSERT(ixa != NULL);
                } else {
                        ixa = conn_get_ixa(connp, B_TRUE);
                        if (ixa == NULL) {
                                /*
                                 * Somebody else was using it and kmem_alloc
                                 * failed! Next memory reclaim will try to
                                 * clean up.
                                 */
                                DTRACE_PROBE1(conn__ixa__cleanup__bail,
                                    conn_t *, connp);
                                return;
                        }
                }
                ixa_cleanup_stale(ixa);
                IXA_REFRELE(ixa);
        }
}

/*
 * ixa needs to be an exclusive copy so that no one changes the cookie
 * or the ixa_nce.
 */
boolean_t
ixa_check_drain_insert(conn_t *connp, ip_xmit_attr_t *ixa)
{
        uintptr_t cookie = ixa->ixa_cookie;
        ill_dld_direct_t *idd;
        idl_tx_list_t *idl_txl;
        ill_t *ill = ixa->ixa_nce->nce_ill;
        boolean_t inserted = B_FALSE;

        idd = &(ill)->ill_dld_capab->idc_direct;
        idl_txl = &ixa->ixa_ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
        mutex_enter(&idl_txl->txl_lock);

        /*
         * If `cookie' is zero, ip_xmit() -> canputnext() failed -- i.e., flow
         * control is asserted on an ill that does not support direct calls.
         * Jump to insert.
         */
        if (cookie == 0)
                goto tryinsert;

        ASSERT(ILL_DIRECT_CAPABLE(ill));

        if (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) == 0) {
                DTRACE_PROBE1(ill__tx__not__blocked, uintptr_t, cookie);
        } else if (idl_txl->txl_cookie != (uintptr_t)NULL &&
            idl_txl->txl_cookie != ixa->ixa_cookie) {
                DTRACE_PROBE2(ill__tx__cookie__collision, uintptr_t, cookie,
                    uintptr_t, idl_txl->txl_cookie);
                /* TODO: bump kstat for cookie collision */
        } else {
                /*
                 * Check/set conn_blocked under conn_lock.  Note that txl_lock
                 * will not suffice since two separate UDP threads may be
                 * racing to send to different destinations that are
                 * associated with different cookies and thus may not be
                 * holding the same txl_lock.  Further, since a given conn_t
                 * can only be on a single drain list, the conn_t will be
                 * enqueued on whichever thread wins this race.
                 */
tryinsert:      mutex_enter(&connp->conn_lock);
                if (connp->conn_blocked) {
                        DTRACE_PROBE1(ill__tx__conn__already__blocked,
                            conn_t *, connp);
                        mutex_exit(&connp->conn_lock);
                } else {
                        connp->conn_blocked = B_TRUE;
                        mutex_exit(&connp->conn_lock);
                        idl_txl->txl_cookie = cookie;
                        conn_drain_insert(connp, idl_txl);
                        if (!IPCL_IS_NONSTR(connp))
                                noenable(connp->conn_wq);
                        inserted = B_TRUE;
                }
        }
        mutex_exit(&idl_txl->txl_lock);
        return (inserted);
}