root/usr/src/uts/common/inet/tcp/tcp_opt_data.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright 2019 Joyent, Inc.
 * Copyright (c) 2016 by Delphix. All rights reserved.
 * Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
 * Copyright 2024 Oxide Computer Company
 */

#include <sys/types.h>
#include <sys/stream.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/socket.h>
#include <sys/xti_xtiopt.h>
#include <sys/xti_inet.h>
#include <sys/policy.h>

#include <inet/cc.h>
#include <inet/common.h>
#include <netinet/ip6.h>
#include <inet/ip.h>

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <inet/optcom.h>
#include <inet/proto_set.h>
#include <inet/tcp_impl.h>

static int      tcp_opt_default(queue_t *, int, int, uchar_t *);

/*
 * Table of all known options handled on a TCP protocol stack.
 *
 * Note: This table contains options processed by both TCP and IP levels
 *       and is the superset of options that can be performed on a TCP over IP
 *       stack.
 *
 * Each entry starts with the option name followed by the level it belongs
 * to (SOL_SOCKET, IPPROTO_TCP, IPPROTO_IP or IPPROTO_IPV6).  The remaining
 * columns follow the layout of opdes_t (presumably declared in
 * <inet/optcom.h> -- consult that definition for the exact meaning of the
 * access, privilege and property columns); the last two columns appear to be
 * the option's (maximum) size and its default value.
 *
 * The entries flagged OP_DEF_FN are exactly the options for which
 * tcp_opt_default() supplies a value.
 *
 * The table is published through tcp_opt_obj.
 */
opdes_t tcp_opt_arr[] = {

/* Socket-level options (SOL_SOCKET) */
{ SO_LINGER,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
        sizeof (struct linger), 0 },

{ SO_DEBUG,     SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_KEEPALIVE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_DONTROUTE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
        },
{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE,      SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_RCVBUF,    SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
        sizeof (struct timeval), 0 },
{ SO_RCVTIMEO,  SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0,
        sizeof (struct timeval), 0 },
{ SO_DGRAM_ERRIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
        },
{ SO_SND_COPYAVOID, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_ANON_MLP, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
        0 },
{ SO_MAC_EXEMPT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
        0 },
{ SO_MAC_IMPLICIT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
        0 },
{ SO_ALLZONES, SOL_SOCKET, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
        0 },
{ SO_EXCLBIND, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ SO_DOMAIN,    SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

{ SO_PROTOTYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },

/* TCP-level options (IPPROTO_TCP) */
{ TCP_NODELAY,  IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
        },
{ TCP_MAXSEG,   IPPROTO_TCP, OA_R, OA_R, OP_NP, 0, sizeof (uint_t),
        536 },

{ TCP_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
        OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
        OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_NOTIFY_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
        OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_CONN_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
        OP_DEF_FN, sizeof (int), -1 /* not initialized */ },

{ TCP_RECVDSTADDR, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int),
        0 },

{ TCP_ANONPRIVBIND, IPPROTO_TCP, OA_R, OA_RW, OP_PRIVPORT, 0,
        sizeof (int), 0 },

{ TCP_EXCLBIND, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
        },

{ TCP_INIT_CWND, IPPROTO_TCP, OA_RW, OA_RW, OP_CONFIG, 0,
        sizeof (int), 0 },

{ TCP_KEEPALIVE_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },

{ TCP_KEEPIDLE, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPCNT, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPINTVL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_KEEPALIVE_ABORT_THRESHOLD, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },

{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_QUICKACK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_MD5SIG, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },

{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ TCP_CONGESTION, IPPROTO_TCP, OA_RW, OA_RW, OP_NP,
        OP_VARLEN, CC_ALGO_NAME_MAX, 0 },

/* IP-level options; IPPROTO_IP and IPPROTO_IPV6 entries are interleaved */
{ IP_OPTIONS,   IPPROTO_IP, OA_RW, OA_RW, OP_NP,
        (OP_VARLEN|OP_NODEFAULT),
        IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
{ T_IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
        (OP_VARLEN|OP_NODEFAULT),
        IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },

{ IP_TOS,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ T_IP_TOS,     IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ IP_TTL,       IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
        sizeof (int), -1 /* not initialized */ },
{ IP_RECVTOS,   IPPROTO_IP,  OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_SEC_OPT, IPPROTO_IP, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
        sizeof (ipsec_req_t), -1 /* not initialized */ },

{ IP_BOUND_IF, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int),   0 /* no ifindex */ },

{ IP_UNSPEC_SRC, IPPROTO_IP, OA_R, OA_RW, OP_RAW, 0,
        sizeof (int), 0 },

{ IP_MINTTL,    IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IPV6_UNICAST_HOPS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_DEF_FN,
        sizeof (int), -1 /* not initialized */ },

{ IPV6_BOUND_IF, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int),   0 /* no ifindex */ },

{ IP_DONTFRAG, IPPROTO_IP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },

{ IP_NEXTHOP, IPPROTO_IP, OA_R, OA_RW, OP_CONFIG, 0,
        sizeof (in_addr_t),     -1 /* not initialized  */ },

{ IPV6_UNSPEC_SRC, IPPROTO_IPV6, OA_R, OA_RW, OP_RAW, 0,
        sizeof (int), 0 },

{ IPV6_PKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
        (OP_NODEFAULT|OP_VARLEN),
        sizeof (struct in6_pktinfo), -1 /* not initialized */ },
{ IPV6_NEXTHOP, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
        OP_NODEFAULT,
        sizeof (sin6_t), -1 /* not initialized */ },
{ IPV6_HOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
        (OP_VARLEN|OP_NODEFAULT), 255*8,
        -1 /* not initialized */ },
{ IPV6_DSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
        (OP_VARLEN|OP_NODEFAULT), 255*8,
        -1 /* not initialized */ },
{ IPV6_RTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
        (OP_VARLEN|OP_NODEFAULT), 255*8,
        -1 /* not initialized */ },
{ IPV6_RTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
        (OP_VARLEN|OP_NODEFAULT), 255*8,
        -1 /* not initialized */ },
{ IPV6_TCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
        OP_NODEFAULT,
        sizeof (int), -1 /* not initialized */ },
{ IPV6_PATHMTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP,
        OP_NODEFAULT,
        sizeof (struct ip6_mtuinfo), -1 /* not initialized */ },
{ IPV6_DONTFRAG, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ IPV6_USE_MIN_MTU, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ IPV6_V6ONLY, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },

/* Enable receipt of ancillary data */
{ IPV6_RECVPKTINFO, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ IPV6_RECVHOPLIMIT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ IPV6_RECVHOPOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ _OLD_IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ IPV6_RECVDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ IPV6_RECVRTHDR, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ IPV6_RECVRTHDRDSTOPTS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
{ IPV6_RECVTCLASS, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },

{ IPV6_SEC_OPT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, OP_NODEFAULT,
        sizeof (ipsec_req_t), -1 /* not initialized */ },
{ IPV6_SRC_PREFERENCES, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (uint32_t), IPV6_PREFER_SRC_DEFAULT },

{ IPV6_MINHOPCOUNT, IPPROTO_IPV6, OA_RW, OA_RW, OP_NP, 0,
        sizeof (int), 0 },
};

/*
 * Table of all supported option levels.
 *
 * Note: Some levels (e.g. XTI_GENERIC) may be valid but may not have
 * any supported options, so the set of valid levels cannot be derived from
 * tcp_opt_arr and has to be recorded separately.
 *
 * This is needed only for topmost tpi providers and is used only by
 * XTI interfaces.
 */
optlevel_t      tcp_valid_levels_arr[] = {
        /* valid level, but no tcp_opt_arr entry uses it */
        XTI_GENERIC,
        /* the remaining levels all have entries in tcp_opt_arr */
        SOL_SOCKET,
        IPPROTO_TCP,
        IPPROTO_IP,
        IPPROTO_IPV6
};


/*
 * Entry counts of the option table and of the valid-level table; both are
 * stored in tcp_opt_obj alongside the tables themselves.
 */
#define TCP_OPT_ARR_CNT         A_CNT(tcp_opt_arr)
#define TCP_VALID_LEVELS_CNT    A_CNT(tcp_valid_levels_arr)

/*
 * NOTE(review): the name suggests the largest option buffer size TCP needs;
 * the code that computes it is not in this part of the file -- confirm.
 */
uint_t tcp_max_optsize; /* initialized when TCP driver is loaded */

/*
 * Option database object for TCP.
 *
 * Bundles the TCP default/get/set handlers together with the option table
 * and the valid-level table (and their entry counts).  This is the object
 * handed to the {sock,tpi}optcom_req() interface routines, which search it
 * to carry out option management on behalf of TCP.
 */

optdb_obj_t tcp_opt_obj = {
        tcp_opt_default,        /* TCP default value function pointer */
        tcp_tpi_opt_get,        /* TCP get function pointer */
        tcp_tpi_opt_set,        /* TCP set function pointer */
        TCP_OPT_ARR_CNT,        /* TCP option database count of entries */
        tcp_opt_arr,            /* TCP option database */
        TCP_VALID_LEVELS_CNT,   /* TCP valid level count of entries */
        tcp_valid_levels_arr    /* TCP valid level array */
};

/*
 * Ceiling for TCP_INIT_CWND.  The TCP_INIT_CWND handling in tcp_opt_set()
 * rejects with EINVAL a (privileged) request for an initial congestion
 * window larger than this value.
 */
static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND;

/*
 * XTI allows an option to be "set" simply by naming it in the option
 * buffer of a connection-related primitive.  We regard that mechanism as
 * mainly relevant to OSI protocols and do not support it in general; it is
 * honoured only for the handful of options needed by the XTI feature
 * tests.  [ The XTI interface test suite specifics imply that any
 * XTI_GENERIC level XTI_* option, if ever implemented, will have to be
 * allowed here as well. ]
 *
 * Returns B_TRUE when (level, name) may be set this way -- currently only
 * IPPROTO_TCP/TCP_NODELAY -- and B_FALSE for everything else.
 */
static boolean_t
tcp_allow_connopt_set(int level, int name)
{
        if (level == IPPROTO_TCP && name == TCP_NODELAY)
                return (B_TRUE);

        return (B_FALSE);
}

/*
 * Supply the default value of those options whose defaults are kept by the
 * protocol code (in the per-stack TCP settings reached through the queue)
 * rather than in the option table.
 *
 * On success the default is stored through ptr as a 32-bit integer and
 * sizeof (int) is returned.  For any level/name pair not handled here the
 * routine returns -1 and does not write through ptr.
 */
/* ARGSUSED */
static int
tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr)
{
        tcp_stack_t     *stack = Q_TO_TCP(q)->tcp_tcps;
        int32_t         *defp = (int32_t *)ptr;

        if (level == IPPROTO_TCP) {
                switch (name) {
                case TCP_NOTIFY_THRESHOLD:
                        *defp = stack->tcps_ip_notify_interval;
                        return (sizeof (int));
                case TCP_ABORT_THRESHOLD:
                        *defp = stack->tcps_ip_abort_interval;
                        return (sizeof (int));
                case TCP_CONN_NOTIFY_THRESHOLD:
                        *defp = stack->tcps_ip_notify_cinterval;
                        return (sizeof (int));
                case TCP_CONN_ABORT_THRESHOLD:
                        *defp = stack->tcps_ip_abort_cinterval;
                        return (sizeof (int));
                }
                return (-1);
        }

        if (level == IPPROTO_IP) {
                if (name != IP_TTL)
                        return (-1);
                *defp = stack->tcps_ipv4_ttl;
                return (sizeof (int));
        }

        if (level == IPPROTO_IPV6) {
                if (name != IPV6_UNICAST_HOPS)
                        return (-1);
                *defp = stack->tcps_ipv6_hoplimit;
                return (sizeof (int));
        }

        /* No other level has protocol-maintained defaults. */
        return (-1);
}

/*
 * TCP routine to get the values of options.
 *
 * Options whose state lives in the tcp_t are answered here directly; the
 * value is written to ptr and the number of bytes written is returned.
 * Everything else is passed on to conn_opt_get() with conn_lock held, and
 * that routine's result is returned unchanged.  A return of -1 indicates
 * failure: the level does not apply to this connection, IPV6_PATHMTU was
 * requested before the connection is established, or the congestion
 * control algorithm name did not fit.
 */
int
tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
{
        tcp_t           *tcp = connp->conn_tcp;
        int             *valp = (int *)ptr;
        conn_opt_arg_t  coas;
        int             retval;

        switch (level) {
        case SOL_SOCKET:
                if (name == SO_SND_COPYAVOID) {
                        *valp = tcp->tcp_snd_zcopy_on ?
                            SO_SND_COPYAVOID : 0;
                        return (sizeof (int));
                }
                if (name == SO_ACCEPTCONN) {
                        *valp = (tcp->tcp_state == TCPS_LISTEN);
                        return (sizeof (int));
                }
                break;

        case IPPROTO_TCP: {
                /*
                 * Integer-valued options record their value (and length)
                 * below and are stored in one place after the switch.
                 */
                boolean_t       handled = B_TRUE;
                int             optlen = sizeof (int);
                int             value = 0;

                switch (name) {
                case TCP_NODELAY:
                        value = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0;
                        break;
                case TCP_MAXSEG:
                        value = tcp->tcp_mss;
                        break;
                case TCP_NOTIFY_THRESHOLD:
                        value = (int)tcp->tcp_first_timer_threshold;
                        break;
                case TCP_ABORT_THRESHOLD:
                        value = tcp->tcp_second_timer_threshold;
                        break;
                case TCP_CONN_NOTIFY_THRESHOLD:
                        value = tcp->tcp_first_ctimer_threshold;
                        break;
                case TCP_CONN_ABORT_THRESHOLD:
                        value = tcp->tcp_second_ctimer_threshold;
                        break;
                case TCP_INIT_CWND:
                        value = tcp->tcp_init_cwnd;
                        break;
                case TCP_KEEPALIVE_THRESHOLD:
                        value = tcp->tcp_ka_interval;
                        break;
                case TCP_KEEPIDLE:
                        /*
                         * Reported in seconds; tcp_ka_interval is kept
                         * in milliseconds.
                         */
                        value = tcp->tcp_ka_interval / 1000;
                        break;
                case TCP_KEEPCNT:
                        value = tcp->tcp_ka_cnt;
                        break;
                case TCP_KEEPINTVL:
                        /*
                         * Reported in seconds; tcp_ka_rinterval is kept
                         * in milliseconds.
                         */
                        value = tcp->tcp_ka_rinterval / 1000;
                        break;
                case TCP_KEEPALIVE_ABORT_THRESHOLD:
                        value = tcp->tcp_ka_abort_thres;
                        break;
                case TCP_CONGESTION: {
                        /* String-valued: copy the algorithm name out. */
                        size_t namelen;

                        namelen = strlcpy((char *)ptr, CC_ALGO(tcp)->name,
                            CC_ALGO_NAME_MAX);
                        if (namelen >= CC_ALGO_NAME_MAX)
                                return (-1);
                        /* Length includes the terminating NUL. */
                        return (namelen + 1);
                }
                case TCP_CORK:
                        value = tcp->tcp_cork;
                        break;
                case TCP_QUICKACK:
                        value = tcp->tcp_quickack;
                        break;
                case TCP_MD5SIG:
                        value = tcp->tcp_md5sig;
                        break;
                case TCP_RTO_INITIAL:
                        value = tcp->tcp_rto_initial;
                        optlen = sizeof (uint32_t);
                        break;
                case TCP_RTO_MIN:
                        value = tcp->tcp_rto_min;
                        optlen = sizeof (uint32_t);
                        break;
                case TCP_RTO_MAX:
                        value = tcp->tcp_rto_max;
                        optlen = sizeof (uint32_t);
                        break;
                case TCP_LINGER2:
                        value = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
                        break;
                default:
                        handled = B_FALSE;
                        break;
                }

                if (handled) {
                        *valp = value;
                        return (optlen);
                }
                break;
        }

        case IPPROTO_IP:
                /* IPPROTO_IP options are refused unless this is AF_INET. */
                if (connp->conn_family != AF_INET)
                        return (-1);

                if (name == IP_OPTIONS || name == T_IP_OPTIONS) {
                        /* Caller ensures enough space */
                        return (ip_opt_get_user(connp, ptr));
                }
                break;

        case IPPROTO_IPV6:
                /*
                 * IPPROTO_IPV6 options are only supported for sockets
                 * that are using IPv6 on the wire.
                 */
                if (connp->conn_ipversion != IPV6_VERSION)
                        return (-1);

                /* Refuse IPV6_PATHMTU until the connection is established. */
                if (name == IPV6_PATHMTU &&
                    tcp->tcp_state < TCPS_ESTABLISHED)
                        return (-1);
                break;
        }

        /* Not answered above: let the generic conn_t code handle it. */
        coas.coa_connp = connp;
        coas.coa_ixa = connp->conn_ixa;
        coas.coa_ipp = &connp->conn_xmit_ipp;
        coas.coa_ancillary = B_FALSE;
        coas.coa_changed = 0;

        mutex_enter(&connp->conn_lock);
        retval = conn_opt_get(&coas, level, name, ptr);
        mutex_exit(&connp->conn_lock);

        return (retval);
}

/*
 * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
 * Parameters are assumed to be verified by the caller.
 */
/* ARGSUSED */
int
tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
    void *thisdg_attrs, cred_t *cr)
{
        tcp_t   *tcp = connp->conn_tcp;
        int     *i1 = (int *)invalp;
        boolean_t onoff = (*i1 == 0) ? 0 : 1;
        boolean_t checkonly;
        int     reterr;
        tcp_stack_t     *tcps = tcp->tcp_tcps;
        conn_opt_arg_t  coas;
        uint32_t        val = *((uint32_t *)invalp);

        coas.coa_connp = connp;
        coas.coa_ixa = connp->conn_ixa;
        coas.coa_ipp = &connp->conn_xmit_ipp;
        coas.coa_ancillary = B_FALSE;
        coas.coa_changed = 0;

        switch (optset_context) {
        case SETFN_OPTCOM_CHECKONLY:
                checkonly = B_TRUE;
                /*
                 * Note: Implies T_CHECK semantics for T_OPTCOM_REQ
                 * inlen != 0 implies value supplied and
                 *      we have to "pretend" to set it.
                 * inlen == 0 implies that there is no
                 *      value part in T_CHECK request and just validation
                 * done elsewhere should be enough, we just return here.
                 */
                if (inlen == 0) {
                        *outlenp = 0;
                        return (0);
                }
                break;
        case SETFN_OPTCOM_NEGOTIATE:
                checkonly = B_FALSE;
                break;
        case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */
        case SETFN_CONN_NEGOTIATE:
                checkonly = B_FALSE;
                /*
                 * Negotiating local and "association-related" options
                 * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ)
                 * primitives is allowed by XTI, but we choose
                 * to not implement this style negotiation for Internet
                 * protocols (We interpret it is a must for OSI world but
                 * optional for Internet protocols) for all options.
                 * [ Will do only for the few options that enable test
                 * suites that our XTI implementation of this feature
                 * works for transports that do allow it ]
                 */
                if (!tcp_allow_connopt_set(level, name)) {
                        *outlenp = 0;
                        return (EINVAL);
                }
                break;
        default:
                /*
                 * We should never get here
                 */
                *outlenp = 0;
                return (EINVAL);
        }

        ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) ||
            (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0));

        /*
         * For TCP, we should have no ancillary data sent down
         * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs
         * has to be zero.
         */
        ASSERT(thisdg_attrs == NULL);

        /*
         * For fixed length options, no sanity check
         * of passed in length is done. It is assumed *_optcom_req()
         * routines do the right thing.
         */
        switch (level) {
        case SOL_SOCKET:
                switch (name) {
                case SO_KEEPALIVE:
                        if (checkonly) {
                                /* check only case */
                                break;
                        }

                        if (!onoff) {
                                if (connp->conn_keepalive) {
                                        if (tcp->tcp_ka_tid != 0) {
                                                (void) TCP_TIMER_CANCEL(tcp,
                                                    tcp->tcp_ka_tid);
                                                tcp->tcp_ka_tid = 0;
                                        }
                                        connp->conn_keepalive = 0;
                                }
                                break;
                        }
                        if (!connp->conn_keepalive) {
                                /* Crank up the keepalive timer */
                                tcp->tcp_ka_last_intrvl = 0;
                                tcp->tcp_ka_tid = TCP_TIMER(tcp,
                                    tcp_keepalive_timer, tcp->tcp_ka_interval);
                                connp->conn_keepalive = 1;
                        }
                        break;
                case SO_SNDBUF: {
                        if (*i1 > tcps->tcps_max_buf) {
                                *outlenp = 0;
                                return (ENOBUFS);
                        }
                        if (checkonly)
                                break;

                        connp->conn_sndbuf = *i1;
                        if (tcps->tcps_snd_lowat_fraction != 0) {
                                connp->conn_sndlowat = connp->conn_sndbuf /
                                    tcps->tcps_snd_lowat_fraction;
                        }
                        (void) tcp_maxpsz_set(tcp, B_TRUE);
                        /*
                         * If we are flow-controlled, recheck the condition.
                         * There are apps that increase SO_SNDBUF size when
                         * flow-controlled (EWOULDBLOCK), and expect the flow
                         * control condition to be lifted right away.
                         */
                        mutex_enter(&tcp->tcp_non_sq_lock);
                        if (tcp->tcp_flow_stopped &&
                            TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) {
                                tcp_clrqfull(tcp);
                        }
                        mutex_exit(&tcp->tcp_non_sq_lock);
                        *outlenp = inlen;
                        return (0);
                }
                case SO_RCVBUF:
                        if (*i1 > tcps->tcps_max_buf) {
                                *outlenp = 0;
                                return (ENOBUFS);
                        }
                        /* Silently ignore zero */
                        if (!checkonly && *i1 != 0) {
                                *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss);
                                (void) tcp_rwnd_set(tcp, *i1);
                        }
                        /*
                         * XXX should we return the rwnd here
                         * and tcp_opt_get ?
                         */
                        *outlenp = inlen;
                        return (0);
                case SO_SND_COPYAVOID:
                        if (!checkonly) {
                                if (tcp->tcp_loopback ||
                                    (onoff != 1) || !tcp_zcopy_check(tcp)) {
                                        *outlenp = 0;
                                        return (EOPNOTSUPP);
                                }
                                tcp->tcp_snd_zcopy_aware = 1;
                        }
                        *outlenp = inlen;
                        return (0);
                }
                break;
        case IPPROTO_TCP:
                switch (name) {
                case TCP_NODELAY:
                        if (!checkonly)
                                tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss;
                        break;
                case TCP_NOTIFY_THRESHOLD:
                        if (!checkonly)
                                tcp->tcp_first_timer_threshold = *i1;
                        break;
                case TCP_ABORT_THRESHOLD:
                        if (!checkonly)
                                tcp->tcp_second_timer_threshold = *i1;
                        break;
                case TCP_CONN_NOTIFY_THRESHOLD:
                        if (!checkonly)
                                tcp->tcp_first_ctimer_threshold = *i1;
                        break;
                case TCP_CONN_ABORT_THRESHOLD:
                        if (!checkonly)
                                tcp->tcp_second_ctimer_threshold = *i1;
                        break;
                case TCP_RECVDSTADDR:
                        if (tcp->tcp_state > TCPS_LISTEN) {
                                *outlenp = 0;
                                return (EOPNOTSUPP);
                        }
                        /* Setting done in conn_opt_set */
                        break;
                case TCP_INIT_CWND:
                        if (checkonly)
                                break;

                        /*
                         * Only allow socket with network configuration
                         * privilege to set the initial cwnd to be larger
                         * than allowed by RFC 3390.
                         */
                        if (val > MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
                                if ((reterr = secpolicy_ip_config(cr, B_TRUE))
                                    != 0) {
                                        *outlenp = 0;
                                        return (reterr);
                                }
                                if (val > tcp_max_init_cwnd) {
                                        *outlenp = 0;
                                        return (EINVAL);
                                }
                        }

                        tcp->tcp_init_cwnd = val;

                        /*
                         * If the socket is connected, AND no outbound data
                         * has been sent, reset the actual cwnd values.
                         */
                        if (tcp->tcp_state == TCPS_ESTABLISHED &&
                            tcp->tcp_iss == tcp->tcp_snxt - 1) {
                                tcp->tcp_cwnd =
                                    MIN(tcp->tcp_rwnd, val * tcp->tcp_mss);
                        }
                        break;

                /*
                 * TCP_KEEPIDLE is in seconds but TCP_KEEPALIVE_THRESHOLD
                 * is in milliseconds. TCP_KEEPIDLE is introduced for
                 * compatibility with other Unix flavors.
                 * We can fall through TCP_KEEPALIVE_THRESHOLD logic after
                 * converting the input to milliseconds.
                 */
                case TCP_KEEPIDLE:
                        *i1 *= 1000;
                        /* FALLTHRU */

                case TCP_KEEPALIVE_THRESHOLD:
                        if (checkonly)
                                break;

                        if (*i1 < tcps->tcps_keepalive_interval_low ||
                            *i1 > tcps->tcps_keepalive_interval_high) {
                                *outlenp = 0;
                                return (EINVAL);
                        }
                        if (*i1 != tcp->tcp_ka_interval) {
                                tcp->tcp_ka_interval = *i1;
                                /*
                                 * Check if we need to restart the
                                 * keepalive timer.
                                 */
                                if (tcp->tcp_ka_tid != 0) {
                                        ASSERT(connp->conn_keepalive);
                                        (void) TCP_TIMER_CANCEL(tcp,
                                            tcp->tcp_ka_tid);
                                        tcp->tcp_ka_last_intrvl = 0;
                                        tcp->tcp_ka_tid = TCP_TIMER(tcp,
                                            tcp_keepalive_timer,
                                            tcp->tcp_ka_interval);
                                }
                        }
                        break;

                /*
                 * tcp_ka_abort_thres = tcp_ka_rinterval * tcp_ka_cnt.
                 * So setting TCP_KEEPCNT or TCP_KEEPINTVL can affect all the
                 * three members - tcp_ka_abort_thres, tcp_ka_rinterval and
                 * tcp_ka_cnt.
                 */
                case TCP_KEEPCNT:
                        if (checkonly)
                                break;

                        if (*i1 == 0) {
                                return (EINVAL);
                        } else if (tcp->tcp_ka_rinterval == 0) {
                                /*
                                 * When TCP_KEEPCNT is specified without first
                                 * specifying a TCP_KEEPINTVL, we infer an
                                 * interval based on a tunable specific to our
                                 * stack: the tcp_keepalive_abort_interval.
                                 * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
                                 * the unlikely event that that has been set.)
                                 * Given the abort interval's default value of
                                 * 480 seconds, low TCP_KEEPCNT values can
                                 * result in intervals that exceed the default
                                 * maximum RTO of 60 seconds.  Rather than
                                 * fail in these cases, we (implicitly) clamp
                                 * the interval at the maximum RTO; if the
                                 * TCP_KEEPCNT is shortly followed by a
                                 * TCP_KEEPINTVL (as we expect), the abort
                                 * threshold will be recalculated correctly --
                                 * and if a TCP_KEEPINTVL is not forthcoming,
                                 * keep-alive will at least operate reasonably
                                 * given the underconfigured state.
                                 */
                                uint32_t interval;

                                interval = tcp->tcp_ka_abort_thres / *i1;

                                if (interval < tcp->tcp_rto_min)
                                        interval = tcp->tcp_rto_min;

                                if (interval > tcp->tcp_rto_max)
                                        interval = tcp->tcp_rto_max;

                                tcp->tcp_ka_rinterval = interval;
                        } else {
                                if ((*i1 * tcp->tcp_ka_rinterval) <
                                    tcps->tcps_keepalive_abort_interval_low ||
                                    (*i1 * tcp->tcp_ka_rinterval) >
                                    tcps->tcps_keepalive_abort_interval_high)
                                        return (EINVAL);
                                tcp->tcp_ka_abort_thres =
                                    (*i1 * tcp->tcp_ka_rinterval);
                        }
                        tcp->tcp_ka_cnt = *i1;
                        break;
                case TCP_KEEPINTVL:
                        /*
                         * TCP_KEEPINTVL is specified in seconds, but
                         * tcp_ka_rinterval is in milliseconds.
                         */

                        if (checkonly)
                                break;

                        if ((*i1 * 1000) < tcp->tcp_rto_min ||
                            (*i1 * 1000) > tcp->tcp_rto_max)
                                return (EINVAL);

                        if (tcp->tcp_ka_cnt == 0) {
                                tcp->tcp_ka_cnt =
                                    tcp->tcp_ka_abort_thres / (*i1 * 1000);
                        } else {
                                if ((*i1 * tcp->tcp_ka_cnt * 1000) <
                                    tcps->tcps_keepalive_abort_interval_low ||
                                    (*i1 * tcp->tcp_ka_cnt * 1000) >
                                    tcps->tcps_keepalive_abort_interval_high)
                                        return (EINVAL);
                                tcp->tcp_ka_abort_thres =
                                    (*i1 * tcp->tcp_ka_cnt * 1000);
                        }
                        tcp->tcp_ka_rinterval = *i1 * 1000;
                        break;
                case TCP_KEEPALIVE_ABORT_THRESHOLD:
                        if (!checkonly) {
                                if (*i1 <
                                    tcps->tcps_keepalive_abort_interval_low ||
                                    *i1 >
                                    tcps->tcps_keepalive_abort_interval_high) {
                                        *outlenp = 0;
                                        return (EINVAL);
                                }
                                tcp->tcp_ka_abort_thres = *i1;
                                tcp->tcp_ka_cnt = 0;
                                tcp->tcp_ka_rinterval = 0;
                        }
                        break;
                case TCP_CONGESTION: {
                        struct cc_algo *algo;

                        if (checkonly) {
                                break;
                        }

                        /*
                         * Make sure the string is NUL-terminated. Some
                         * consumers pass only the number of characters
                         * in the string, and don't include the NUL
                         * terminator, so we set it for them.
                         */
                        if (inlen < CC_ALGO_NAME_MAX) {
                                invalp[inlen] = '\0';
                        }
                        invalp[CC_ALGO_NAME_MAX - 1] = '\0';

                        if ((algo = cc_load_algo((char *)invalp)) == NULL) {
                                return (ENOENT);
                        }

                        if (CC_ALGO(tcp)->cb_destroy != NULL) {
                                CC_ALGO(tcp)->cb_destroy(&tcp->tcp_ccv);
                        }

                        CC_DATA(tcp) = NULL;
                        CC_ALGO(tcp) = algo;

                        if (CC_ALGO(tcp)->cb_init != NULL) {
                                VERIFY0(CC_ALGO(tcp)->cb_init(&tcp->tcp_ccv));
                        }

                        break;
                }
                case TCP_CORK:
                        if (!checkonly) {
                                /*
                                 * if tcp->tcp_cork was set and is now
                                 * being unset, we have to make sure that
                                 * the remaining data gets sent out. Also
                                 * unset tcp->tcp_cork so that tcp_wput_data()
                                 * can send data even if it is less than mss
                                 */
                                if (tcp->tcp_cork && onoff == 0 &&
                                    tcp->tcp_unsent > 0) {
                                        tcp->tcp_cork = B_FALSE;
                                        tcp_wput_data(tcp, NULL, B_FALSE);
                                }
                                tcp->tcp_cork = onoff;
                        }
                        break;
                case TCP_QUICKACK:
                        if (!checkonly) {
                                tcp->tcp_quickack = onoff;
                        }
                        break;
                case TCP_MD5SIG:
                        if (!checkonly) {
                                tcp->tcp_md5sig = onoff;
                        }
                        break;
                case TCP_RTO_INITIAL:
                        if (checkonly || val == 0)
                                break;

                        /*
                         * Sanity checks
                         *
                         * The initial RTO should be bounded by the minimum
                         * and maximum RTO.  And it should also be smaller
                         * than the connect attempt abort timeout.  Otherwise,
                         * the connection won't be aborted in a period
                         * reasonably close to that timeout.
                         */
                        if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
                            val > tcp->tcp_second_ctimer_threshold ||
                            val < tcps->tcps_rexmit_interval_initial_low ||
                            val > tcps->tcps_rexmit_interval_initial_high) {
                                *outlenp = 0;
                                return (EINVAL);
                        }
                        tcp->tcp_rto_initial = val;

                        /*
                         * If TCP has not sent anything, need to re-calculate
                         * tcp_rto.  Otherwise, this option change does not
                         * really affect anything.
                         */
                        if (tcp->tcp_state >= TCPS_SYN_SENT)
                                break;

                        tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
                        tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
                        tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
                            tcps->tcps_conn_grace_period);
                        break;
                case TCP_RTO_MIN:
                        if (checkonly || val == 0)
                                break;

                        if (val < tcps->tcps_rexmit_interval_min_low ||
                            val > tcps->tcps_rexmit_interval_min_high ||
                            val > tcp->tcp_rto_max) {
                                *outlenp = 0;
                                return (EINVAL);
                        }
                        tcp->tcp_rto_min = val;
                        if (tcp->tcp_rto < val)
                                tcp->tcp_rto = val;
                        break;
                case TCP_RTO_MAX:
                        if (checkonly || val == 0)
                                break;

                        /*
                         * Sanity checks
                         *
                         * The maximum RTO should not be larger than the
                         * connection abort timeout.  Otherwise, the
                         * connection won't be aborted in a period reasonably
                         * close to that timeout.
                         */
                        if (val < tcps->tcps_rexmit_interval_max_low ||
                            val > tcps->tcps_rexmit_interval_max_high ||
                            val < tcp->tcp_rto_min ||
                            val > tcp->tcp_second_timer_threshold) {
                                *outlenp = 0;
                                return (EINVAL);
                        }
                        tcp->tcp_rto_max = val;
                        if (tcp->tcp_rto > val)
                                tcp->tcp_rto = val;
                        break;
                case TCP_LINGER2:
                        if (checkonly || *i1 == 0)
                                break;

                        /*
                         * Note that the option value is in seconds.  The
                         * value must be no smaller than the lower bound of
                         * the private parameter tcp_fin_wait_2_flush_interval
                         * and no larger than the current value of that
                         * parameter.  The upper limit keeps an app from
                         * setting TCP_LINGER2 to a big value, which would
                         * cause resources to be held up too long in the
                         * FIN-WAIT-2 state.
                         */
                        if (*i1 < 0 ||
                            tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
                            *i1 ||
                            tcps->tcps_fin_wait_2_flush_interval/SECONDS <
                            *i1) {
                                *outlenp = 0;
                                return (EINVAL);
                        }
                        tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
                        break;
                default:
                        break;
                }
                break;
        case IPPROTO_IP:
                if (connp->conn_family != AF_INET) {
                        *outlenp = 0;
                        return (EINVAL);
                }
                switch (name) {
                case IP_SEC_OPT:
                        /*
                         * We should not allow policy setting after
                         * we start listening for connections.
                         */
                        if (tcp->tcp_state == TCPS_LISTEN) {
                                return (EINVAL);
                        }
                        break;
                case IP_RECVTOS:
                        if (!checkonly) {
                                /*
                                 * Force it to be sent up with the next msg
                                 * by setting it to a value which cannot
                                 * appear in a packet (TOS is only 8-bits)
                                 */
                                tcp->tcp_recvtos = 0xffffffffU;
                        }
                        break;
                }
                break;
        case IPPROTO_IPV6:
                /*
                 * IPPROTO_IPV6 options are only supported for sockets
                 * that are using IPv6 on the wire.
                 */
                if (connp->conn_ipversion != IPV6_VERSION) {
                        *outlenp = 0;
                        return (EINVAL);
                }

                switch (name) {
                case IPV6_RECVPKTINFO:
                        if (!checkonly) {
                                /* Force it to be sent up with the next msg */
                                tcp->tcp_recvifindex = 0;
                        }
                        break;
                case IPV6_RECVTCLASS:
                        if (!checkonly) {
                                /* Force it to be sent up with the next msg */
                                tcp->tcp_recvtclass = 0xffffffffU;
                        }
                        break;
                case IPV6_RECVHOPLIMIT:
                        if (!checkonly) {
                                /* Force it to be sent up with the next msg */
                                tcp->tcp_recvhops = 0xffffffffU;
                        }
                        break;
                case IPV6_PKTINFO:
                        /* This is an extra check for TCP */
                        if (inlen == sizeof (struct in6_pktinfo)) {
                                struct in6_pktinfo *pkti;

                                pkti = (struct in6_pktinfo *)invalp;
                                /*
                                 * RFC 3542 states that ipi6_addr must be
                                 * the unspecified address when setting the
                                 * IPV6_PKTINFO sticky socket option on a
                                 * TCP socket.
                                 */
                                if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr))
                                        return (EINVAL);
                        }
                        break;
                case IPV6_SEC_OPT:
                        /*
                         * We should not allow policy setting after
                         * we start listening for connections.
                         */
                        if (tcp->tcp_state == TCPS_LISTEN) {
                                return (EINVAL);
                        }
                        break;
                }
                break;
        }
        reterr = conn_opt_set(&coas, level, name, inlen, invalp,
            checkonly, cr);
        if (reterr != 0) {
                *outlenp = 0;
                return (reterr);
        }

        /*
         * Common case of OK return with outval same as inval
         */
        if (invalp != outvalp) {
                /* don't trust bcopy for identical src/dst */
                (void) bcopy(invalp, outvalp, inlen);
        }
        *outlenp = inlen;

        if (coas.coa_changed & COA_HEADER_CHANGED) {
                /* If we are connected we rebuild the headers */
                if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
                    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
                        reterr = tcp_build_hdrs(tcp);
                        if (reterr != 0)
                                return (reterr);
                }
        }
        if (coas.coa_changed & COA_ROUTE_CHANGED) {
                in6_addr_t nexthop;

                /*
                 * If we are connected we re-cache the information.
                 * We ignore errors to preserve BSD behavior.
                 * Note that we don't redo IPsec policy lookup here
                 * since the final destination (or source) didn't change.
                 */
                ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa,
                    &connp->conn_faddr_v6, &nexthop);

                if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) &&
                    !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
                        (void) ip_attr_connect(connp, connp->conn_ixa,
                            &connp->conn_laddr_v6, &connp->conn_faddr_v6,
                            &nexthop, connp->conn_fport, NULL, NULL,
                            IPDF_VERIFY_DST);
                }
        }
        if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) {
                connp->conn_wq->q_hiwat = connp->conn_sndbuf;
        }
        if (coas.coa_changed & COA_WROFF_CHANGED) {
                connp->conn_wroff = connp->conn_ht_iphc_allocated +
                    tcps->tcps_wroff_xtra;
                (void) proto_set_tx_wroff(connp->conn_rq, connp,
                    connp->conn_wroff);
        }
        if (coas.coa_changed & COA_OOBINLINE_CHANGED) {
                if (IPCL_IS_NONSTR(connp))
                        proto_set_rx_oob_opt(connp, onoff);
        }
        return (0);
}