root/usr/src/uts/common/fs/sockfs/sockcommon.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2015, Joyent, Inc.
 * Copyright 2017 Sebastian Wiedenroth
 * Copyright 2022 Garrett D'Amore
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/debug.h>
#include <sys/cmn_err.h>
#include <sys/vfs.h>
#include <sys/policy.h>
#include <sys/modctl.h>

#include <sys/sunddi.h>

#include <sys/strsun.h>
#include <sys/stropts.h>
#include <sys/strsubr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/uio.h>

#include <inet/ipclassifier.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/sockfilter_impl.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/sodirect.h>
#include <inet/ip.h>

extern int xnet_skip_checks, xnet_check_print, xnet_truncate_print;

/*
 * Common socket access functions.
 *
 * Instead of accessing the sonode switch directly (i.e., SOP_xxx()),
 * the socket_xxx() function should be used.
 */

/*
 * Create a new sonode for the requested <family, type, protocol>.
 *
 * The sockparams entry is looked up for the exact triple first.  When that
 * fails, an ephemeral entry is built from `devpath' or, failing that, from
 * `mod' if the caller supplied one; with neither, the lookup is retried
 * with protocol 0.
 *
 * Returns the new sonode, or NULL with the reason stored in *errorp.  If
 * the first lookup failed and a later step (without devpath/mod) fails with
 * EPROTONOSUPPORT, EPROTOTYPE or ENOPROTOOPT, the error of the first lookup
 * is the one reported.
 */
/* ARGSUSED */
struct sonode *
socket_create(int family, int type, int protocol, char *devpath, char *mod,
    int flags, int version, struct cred *cr, int *errorp)
{
        struct sockparams *sp = NULL;
        struct sonode *so;
        int first_err;

        /* On success solookup() hands back the entry already held. */
        first_err = *errorp = solookup(family, type, protocol, &sp);

        if (sp == NULL) {
                int kmflags = KM_NOSLEEP;

                if (flags == SOCKET_SLEEP)
                        kmflags = KM_SLEEP;

                /*
                 * No registered entry matched.  Fall back to an ephemeral
                 * entry when a device path or module name was given; the
                 * error of the first lookup is then no longer of interest.
                 */
                if (devpath != NULL) {
                        first_err = 0;
                        sp = sockparams_hold_ephemeral_bydev(family, type,
                            protocol, devpath, kmflags, errorp);
                } else if (mod != NULL) {
                        first_err = 0;
                        sp = sockparams_hold_ephemeral_bymod(family, type,
                            protocol, mod, kmflags, errorp);
                } else {
                        *errorp = solookup(family, type, 0, &sp);
                }

                if (sp == NULL) {
                        if (first_err != 0 && (*errorp == EPROTONOSUPPORT ||
                            *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
                                *errorp = first_err;
                        return (NULL);
                }
        }

        ASSERT(sp->sp_smod_info != NULL);
        ASSERT(flags == SOCKET_SLEEP || flags == SOCKET_NOSLEEP);
        sp->sp_stats.sps_ncreate.value.ui64++;
        so = sp->sp_smod_info->smod_sock_create_func(sp, family, type,
            protocol, version, flags, errorp, cr);
        if (so == NULL) {
                /* Nothing took over the sockparams hold; give it back. */
                SOCKPARAMS_DEC_REF(sp);
                return (NULL);
        }

        *errorp = SOP_INIT(so, NULL, cr, flags);
        if (*errorp == 0) {
                /* Cannot fail, only bumps so_count */
                (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
                return (so);
        }

        if (first_err != 0 && (*errorp == EPROTONOSUPPORT ||
            *errorp == EPROTOTYPE || *errorp == ENOPROTOOPT))
                *errorp = first_err;
        socket_destroy(so);
        return (NULL);
}

/*
 * Create a sonode for a new connection on `parent', reusing the parent's
 * sockparams, family, type, protocol and version.  `lh' and `dc' become the
 * new socket's protocol handle and downcall vector.
 *
 * Returns the new sonode, or NULL with the reason stored in *errorp.
 */
struct sonode *
socket_newconn(struct sonode *parent, sock_lower_handle_t lh,
    sock_downcalls_t *dc, int flags, int *errorp)
{
        struct sockparams *sp = parent->so_sockparams;
        struct cred *cr = CRED();
        struct sonode *so;

        /*
         * This function may be called in interrupt context, and CRED()
         * will be NULL. In this case, use kcred.
         */
        if (cr == NULL)
                cr = kcred;

        ASSERT(sp != NULL);

        sp->sp_stats.sps_ncreate.value.ui64++;
        so = sp->sp_smod_info->smod_sock_create_func(sp, parent->so_family,
            parent->so_type, parent->so_protocol, parent->so_version, flags,
            errorp, cr);
        if (so == NULL)
                return (NULL);

        /* The new sonode refers to sp as well, so take a reference. */
        SOCKPARAMS_INC_REF(sp);

        so->so_proto_handle = lh;
        so->so_downcalls = dc;

        *errorp = SOP_INIT(so, parent, cr, flags);
        if (*errorp != 0) {
                socket_destroy(so);
                return (NULL);
        }

        /* Cannot fail, only bumps so_count */
        (void) VOP_OPEN(&SOTOV(so), FREAD|FWRITE, cr, NULL);
        return (so);
}

/*
 * Bind the local endpoint of `so' to `name'.
 *
 * All arguments are handed unchanged to the sonode's bind operation, whose
 * result is returned.
 */
int
socket_bind(struct sonode *so, struct sockaddr *name, socklen_t namelen,
    int flags, cred_t *cr)
{
        int rv;

        rv = SOP_BIND(so, name, namelen, flags, cr);
        return (rv);
}

/*
 * Turn `so' into a listening socket.
 *
 * A negative backlog is treated as 0.  The value handed to the listen
 * operation is backlog * 3 / 2 + 1, or INT_MAX when the backlog is too
 * large for that expression to be computed safely.
 */
int
socket_listen(struct sonode *so, int backlog, cred_t *cr)
{
        int qlimit;

        if (backlog < 0)
                backlog = 0;

        /*
         * Use the same qlimit as in BSD. BSD checks the qlimit
         * before queuing the next connection implying that a
         * listen(sock, 0) allows one connection to be queued.
         * BSD also uses 1.5 times the requested backlog.
         *
         * XNS Issue 4 required a strict interpretation of the backlog.
         * This has been waived subsequently for Issue 4 and the change
         * incorporated in XNS Issue 5. So we aren't required to do
         * anything special for XPG apps.
         */
        if (backlog < (INT_MAX - 1) / 3)
                qlimit = backlog * 3 / 2 + 1;
        else
                qlimit = INT_MAX;

        return (SOP_LISTEN(so, qlimit, cr));
}

/*
 * Accept an incoming connection on the listener `lso'.
 *
 * The call is forwarded to the sonode's accept operation; `nsop' is passed
 * through for it to fill in.  Returns that operation's result.
 */
int
socket_accept(struct sonode *lso, int fflag, cred_t *cr, struct sonode **nsop)
{
        int rv;

        rv = SOP_ACCEPT(lso, fflag, cr, nsop);
        return (rv);
}

/*
 * Active open: connect `so' to the address in `name'.
 *
 * Returns the result of the sonode's connect operation, except that
 * EHOSTUNREACH is reported as ENETUNREACH when _SOCONNECT_XPG4_2 is set
 * in `flags'.
 */
int
socket_connect(struct sonode *so, struct sockaddr *name,
    socklen_t namelen, int fflag, int flags, cred_t *cr)
{
        int rv;

        /*
         * An address of family AF_UNSPEC is handled like a null address.
         * This is the portable method to unconnect a socket.
         */
        if (namelen >= sizeof (sa_family_t) && name->sa_family == AF_UNSPEC) {
                name = NULL;
                namelen = 0;
        }

        rv = SOP_CONNECT(so, name, namelen, fflag, flags, cr);
        if (rv != EHOSTUNREACH || (flags & _SOCONNECT_XPG4_2) == 0)
                return (rv);

        /*
         * X/Open specification contains a requirement that
         * ENETUNREACH be returned but does not require
         * EHOSTUNREACH. In order to keep the test suite
         * happy we mess with the errno here.
         */
        return (ENETUNREACH);
}

/*
 * Get the address of the remote node.
 *
 * The caller must supply a non-zero *addrlen (asserted).  All arguments,
 * including `accept', are forwarded unchanged to the sonode's getpeername
 * operation, whose result is returned.
 */
int
socket_getpeername(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, boolean_t accept, cred_t *cr)
{
        int rv;

        ASSERT(*addrlen > 0);

        rv = SOP_GETPEERNAME(so, addr, addrlen, accept, cr);
        return (rv);
}

/*
 * Get the local address of `so'.
 *
 * Forwards to the sonode's getsockname operation and returns its result.
 */
int
socket_getsockname(struct sonode *so, struct sockaddr *addr,
    socklen_t *addrlen, cred_t *cr)
{
        int rv;

        rv = SOP_GETSOCKNAME(so, addr, addrlen, cr);
        return (rv);
}

/*
 * Called from shutdown().
 *
 * Forwards `how' to the sonode's shutdown operation and returns its result.
 */
int
socket_shutdown(struct sonode *so, int how, cred_t *cr)
{
        int rv;

        rv = SOP_SHUTDOWN(so, how, cr);
        return (rv);
}

/*
 * Get a socket option.
 *
 * Forwards to the sonode's getsockopt operation; `optval' and `optlenp'
 * are passed through for it to fill in.  Returns that operation's result.
 */
/*ARGSUSED*/
int
socket_getsockopt(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags, cred_t *cr)
{
        int rv;

        rv = SOP_GETSOCKOPT(so, level, option_name, optval, optlenp, flags,
            cr);
        return (rv);
}

/*
 * Set a socket option.
 *
 * `optval' must be aligned for a t_scalar_t, and must be NULL exactly when
 * `optlen' is 0 (both asserted).  In the NULL case the setsockopt operation
 * is given a pointer to a local int holding 1 instead; `optlen' is still
 * passed as 0.
 */
int
socket_setsockopt(struct sonode *so, int level, int option_name,
    const void *optval, t_uscalar_t optlen, cred_t *cr)
{
        int one = 1;
        int rv;

        /* Caller allocates aligned optval, or passes null */
        ASSERT(((uintptr_t)optval & (sizeof (t_scalar_t) - 1)) == 0);
        /* optval and optlen are either both set or both empty */
        ASSERT(optval != NULL || optlen == 0);
        ASSERT(optlen != 0 || optval == NULL);

        if (optlen == 0 && optval == NULL)
                optval = &one;

        rv = SOP_SETSOCKOPT(so, level, option_name, optval, optlen, cr);
        return (rv);
}

/*
 * Send the data described by `uiop' on `so'.
 *
 * Returns the result of the sonode's sendmsg operation, with two
 * adjustments:
 *  - EINTR, ENOMEM and EWOULDBLOCK are turned into success when some data
 *    was consumed from `uiop' (a partial send);
 *  - on EPIPE, SIGPIPE is posted to the current thread unless the socket
 *    is a kernel socket or MSG_NOSIGNAL is set in msg->msg_flags.
 */
int
socket_sendmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
        ssize_t resid_before = uiop->uio_resid;
        int rv;

        /*
         * Do not bypass the cache if we are doing a local (AF_UNIX) write.
         */
        if (so->so_family != AF_UNIX)
                uiop->uio_extflg &= ~UIO_COPY_CACHED;
        else
                uiop->uio_extflg |= UIO_COPY_CACHED;

        rv = SOP_SENDMSG(so, msg, uiop, cr);

        /* EAGAIN is EWOULDBLOCK */
        if (rv == EINTR || rv == ENOMEM || rv == EWOULDBLOCK) {
                /* We did a partial send */
                if (uiop->uio_resid != resid_before)
                        rv = 0;
        } else if (rv == EPIPE) {
                if ((so->so_mode & SM_KERNEL) == 0 &&
                    (msg->msg_flags & MSG_NOSIGNAL) == 0)
                        tsignal(curthread, SIGPIPE);
        }

        return (rv);
}

/*
 * Send the mblk chain *mpp on `so' through the sonode's sendmblk operation.
 *
 * Returns 0 or the errno from that operation.  On EPIPE, SIGPIPE is posted
 * to the current thread unless `so' is a kernel socket (SM_KERNEL);
 * socket_sendmsg() already suppresses the signal for such sockets, and this
 * function now agrees with it.
 *
 * NOTE(review): socket_sendmsg() also honours MSG_NOSIGNAL in
 * msg->msg_flags.  That check is deliberately not mirrored here because it
 * cannot be seen from this file whether every caller of this function
 * initializes msg_flags -- confirm before adding it.
 */
int
socket_sendmblk(struct sonode *so, struct nmsghdr *msg, int fflag,
    struct cred *cr, mblk_t **mpp)
{
        int error;

        error = SOP_SENDMBLK(so, msg, fflag, cr, mpp);
        if (error == EPIPE && (so->so_mode & SM_KERNEL) == 0) {
                tsignal(curthread, SIGPIPE);
        }
        return (error);
}

/*
 * Receive data from `so' into `uiop'.
 *
 * Returns the result of the sonode's recvmsg operation, except that EINTR
 * and EWOULDBLOCK are turned into success when some data was already
 * transferred into `uiop' (a partial read).
 */
int
socket_recvmsg(struct sonode *so, struct nmsghdr *msg, struct uio *uiop,
    cred_t *cr)
{
        ssize_t resid_before = uiop->uio_resid;
        int rv;

        /*
         * Do not bypass the cache when reading data, as the application
         * is likely to access the data shortly.
         */
        uiop->uio_extflg |= UIO_COPY_CACHED;

        rv = SOP_RECVMSG(so, msg, uiop, cr);

        /* EAGAIN is EWOULDBLOCK */
        if ((rv == EINTR || rv == EWOULDBLOCK) &&
            uiop->uio_resid != resid_before) {
                /* We did a partial read */
                rv = 0;
        }

        return (rv);
}

/*
 * Forward an ioctl to the sonode's ioctl operation and return its result.
 */
int
socket_ioctl(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
        int rv;

        rv = SOP_IOCTL(so, cmd, arg, mode, cr, rvalp);
        return (rv);
}

/*
 * Forward a poll request to the sonode's poll operation and return its
 * result; `reventsp' and `phpp' are passed through for it to fill in.
 */
int
socket_poll(struct sonode *so, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
        int rv;

        rv = SOP_POLL(so, events, anyyet, reventsp, phpp);
        return (rv);
}

/*
 * Close `so' by issuing VOP_CLOSE() on its vnode, and return the result.
 */
int
socket_close(struct sonode *so, int flag, struct cred *cr)
{
        int rv;

        rv = VOP_CLOSE(SOTOV(so), flag, 1, 0, cr, NULL);
        return (rv);
}

/*
 * Run the sonode's close operation.  The socket's so_count must already
 * be zero (asserted).  Returns the close operation's result.
 */
int
socket_close_internal(struct sonode *so, int flag, cred_t *cr)
{
        int rv;

        ASSERT(so->so_count == 0);

        rv = SOP_CLOSE(so, flag, cr);
        return (rv);
}

/*
 * Dispose of `so': mark its vnode invalid and release the hold on it.
 */
void
socket_destroy(struct sonode *so)
{
        struct vnode *vp = SOTOV(so);

        vn_invalid(vp);
        VN_RELE(vp);
}

/*
 * Hand `so' to its socket module's destroy function, then drop the
 * reference the socket held on its sockparams entry.  The socket's
 * so_count must be zero and it must have a sockparams entry (asserted).
 */
/* ARGSUSED */
void
socket_destroy_internal(struct sonode *so, cred_t *cr)
{
        struct sockparams *params;

        /* Read the pointer before the sonode is handed off for destruction. */
        params = so->so_sockparams;
        ASSERT(so->so_count == 0 && params != NULL);

        params->sp_smod_info->smod_sock_destroy_func(so);

        SOCKPARAMS_DEC_REF(params);
}

/*
 * Construct the long-lived parts of the sonode in `buf': allocate and
 * attach its vnode, reset the receive/accept/filter bookkeeping, and
 * initialize the locks and condition variables.  The per-use fields are
 * set separately by sonode_init().
 *
 * The (buf, cdrarg, kmflags) signature has the shape of a kmem cache
 * constructor callback; presumably that is how it is registered -- confirm
 * against the cache creation site.
 *
 * Returns 0 on success, or -1 if the vnode could not be allocated.
 *
 * TODO Once the common vnode ops is available, then the vnops argument
 * should be removed.
 * NOTE(review): this function has no vnops argument, so the TODO above
 * looks stale -- confirm and remove.
 */
/*ARGSUSED*/
int
sonode_constructor(void *buf, void *cdrarg, int kmflags)
{
        struct sonode *so = buf;
        struct vnode *vp;

        /* Allocate the vnode and link it and the sonode to each other. */
        vp = so->so_vnode = vn_alloc(kmflags);
        if (vp == NULL) {
                return (-1);
        }
        vp->v_data = so;
        vn_setops(vp, socket_vnodeops);

        so->so_priv             = NULL;
        so->so_oobmsg           = NULL;

        so->so_proto_handle     = NULL;

        so->so_peercred         = NULL;

        /* Receive queue starts out empty, with no receive timer armed. */
        so->so_rcv_queued       = 0;
        so->so_rcv_q_head       = NULL;
        so->so_rcv_q_last_head  = NULL;
        so->so_rcv_head         = NULL;
        so->so_rcv_last_head    = NULL;
        so->so_rcv_wanted       = 0;
        so->so_rcv_timer_interval = SOCKET_NO_RCVTIMER;
        so->so_rcv_timer_tid    = 0;
        so->so_rcv_thresh       = 0;

        /*
         * Both accept lists link sonodes through the same so_acceptq_node
         * member.
         */
        list_create(&so->so_acceptq_list, sizeof (struct sonode),
            offsetof(struct sonode, so_acceptq_node));
        list_create(&so->so_acceptq_defer, sizeof (struct sonode),
            offsetof(struct sonode, so_acceptq_node));
        list_link_init(&so->so_acceptq_node);
        so->so_acceptq_len      = 0;
        so->so_backlog          = 0;
        so->so_listener         = NULL;

        so->so_snd_qfull        = B_FALSE;

        /* No socket filters attached. */
        so->so_filter_active    = 0;
        so->so_filter_tx        = 0;
        so->so_filter_defertime = 0;
        so->so_filter_top       = NULL;
        so->so_filter_bottom    = NULL;

        /* Synchronization primitives; torn down in sonode_destructor(). */
        mutex_init(&so->so_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&so->so_acceptq_lock, NULL, MUTEX_DEFAULT, NULL);
        rw_init(&so->so_fallback_rwlock, NULL, RW_DEFAULT, NULL);
        cv_init(&so->so_state_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&so->so_single_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&so->so_read_cv, NULL, CV_DEFAULT, NULL);

        cv_init(&so->so_acceptq_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&so->so_snd_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&so->so_rcv_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&so->so_copy_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&so->so_closing_cv, NULL, CV_DEFAULT, NULL);

        /* No kernel receive callback registered. */
        so->so_krecv_cb = NULL;
        so->so_krecv_arg = NULL;

        return (0);
}

/*
 * Release everything sonode_constructor() set up for the sonode in `buf':
 * the accept lists, the vnode, the locks and every condition variable.
 *
 * The sonode must already be quiescent; the assertions check that no
 * private data, peer credential, out-of-band message, queued receive data,
 * listener link or filter is still attached, and that the vnode still
 * belongs to this sonode.
 */
/*ARGSUSED*/
void
sonode_destructor(void *buf, void *cdrarg)
{
        struct sonode *so = buf;
        struct vnode *vp = SOTOV(so);

        ASSERT(so->so_priv == NULL);
        ASSERT(so->so_peercred == NULL);

        ASSERT(so->so_oobmsg == NULL);

        ASSERT(so->so_rcv_q_head == NULL);

        list_destroy(&so->so_acceptq_list);
        list_destroy(&so->so_acceptq_defer);
        ASSERT(!list_link_active(&so->so_acceptq_node));
        ASSERT(so->so_listener == NULL);

        ASSERT(so->so_filter_active == 0);
        ASSERT(so->so_filter_tx == 0);
        ASSERT(so->so_filter_top == NULL);
        ASSERT(so->so_filter_bottom == NULL);

        ASSERT(vp->v_data == so);
        ASSERT(vn_matchops(vp, socket_vnodeops));

        vn_free(vp);

        mutex_destroy(&so->so_lock);
        mutex_destroy(&so->so_acceptq_lock);
        rw_destroy(&so->so_fallback_rwlock);

        /*
         * Destroy every condition variable the constructor initialized.
         * so_copy_cv was previously left out even though the constructor
         * calls cv_init() on it.
         */
        cv_destroy(&so->so_state_cv);
        cv_destroy(&so->so_single_cv);
        cv_destroy(&so->so_read_cv);
        cv_destroy(&so->so_acceptq_cv);
        cv_destroy(&so->so_snd_cv);
        cv_destroy(&so->so_rcv_cv);
        cv_destroy(&so->so_copy_cv);
        cv_destroy(&so->so_closing_cv);
}

/*
 * Reset the per-use state of an already constructed sonode.
 *
 * so        sonode to initialize
 * sp        sockparams entry recorded in so_sockparams (no reference is
 *           taken here)
 * family, type, protocol
 *           stored as the socket's identity
 * sops      operations vector recorded in so_ops
 *
 * The sonode is expected to carry no out-of-band message and no peer
 * credential on entry (asserted).
 */
void
sonode_init(struct sonode *so, struct sockparams *sp, int family,
    int type, int protocol, sonodeops_t *sops)
{
        vnode_t *vp;

        vp = SOTOV(so);

        so->so_flag     = 0;

        so->so_state    = 0;
        so->so_mode     = 0;

        so->so_count    = 0;

        so->so_family   = family;
        so->so_type     = type;
        so->so_protocol = protocol;

        SOCK_CONNID_INIT(so->so_proto_connid);

        /* Socket options, buffer sizes and timeouts all start at zero. */
        so->so_options  = 0;
        so->so_linger.l_onoff   = 0;
        so->so_linger.l_linger = 0;
        so->so_sndbuf   = 0;
        so->so_error    = 0;
        so->so_rcvtimeo = 0;
        so->so_sndtimeo = 0;
        so->so_xpg_rcvbuf = 0;

        ASSERT(so->so_oobmsg == NULL);
        so->so_oobmark  = 0;
        so->so_pgrp     = 0;

        ASSERT(so->so_peercred == NULL);

        /* Record the zone id reported by getzoneid() at this point. */
        so->so_zoneid = getzoneid();

        so->so_sockparams = sp;

        so->so_ops = sops;

        /* Set for every ops vector other than sotpi_sonodeops. */
        so->so_not_str = (sops != &sotpi_sonodeops);

        so->so_proto_handle = NULL;

        so->so_downcalls = NULL;

        so->so_copyflag = 0;

        /* Re-initialize the vnode, then set the socket-specific fields. */
        vn_reinit(vp);
        vp->v_vfsp      = sock_vfsp;
        vp->v_type      = VSOCK;
        vp->v_rdev      = sockdev;

        so->so_snd_qfull = B_FALSE;
        so->so_minpsz = 0;

        so->so_rcv_wakeup = B_FALSE;
        so->so_snd_wakeup = B_FALSE;
        so->so_flowctrld = B_FALSE;

        /* Clear poll state, protocol properties and ksocket callbacks. */
        so->so_pollev = 0;
        bzero(&so->so_poll_list, sizeof (so->so_poll_list));
        bzero(&so->so_proto_props, sizeof (struct sock_proto_props));

        bzero(&(so->so_ksock_callbacks), sizeof (ksocket_callbacks_t));
        so->so_ksock_cb_arg = NULL;

        so->so_max_addr_len = sizeof (struct sockaddr_storage);

        so->so_direct = NULL;

        /* Called last, once the vnode fields above have been set. */
        vn_exists(vp);
}

/*
 * Tear down the per-use state of `so'.  The socket's so_count must be
 * zero (asserted).
 *
 * Cancels a pending receive timer, wakes and cleans up any pollers,
 * finishes sodirect state, invalidates the vnode, frees the peer
 * credential, detaches filters and clears the krecv callback.  The accept
 * and receive queues must already be empty (asserted at the end).
 */
void
sonode_fini(struct sonode *so)
{
        vnode_t *vp;

        ASSERT(so->so_count == 0);

        /* Cancel the receive timer; so_lock must not be held across this. */
        if (so->so_rcv_timer_tid) {
                ASSERT(MUTEX_NOT_HELD(&so->so_lock));
                (void) untimeout(so->so_rcv_timer_tid);
                so->so_rcv_timer_tid = 0;
        }

        /* Wake anything still on the poll list with POLLERR, then clean it. */
        if (so->so_poll_list.ph_list != NULL) {
                pollwakeup(&so->so_poll_list, POLLERR);
                pollhead_clean(&so->so_poll_list);
        }

        if (so->so_direct != NULL)
                sod_sock_fini(so);

        vp = SOTOV(so);
        vn_invalid(vp);

        /* Drop the peer credential, if one was recorded. */
        if (so->so_peercred != NULL) {
                crfree(so->so_peercred);
                so->so_peercred = NULL;
        }
        /* Detach and destroy filters */
        if (so->so_filter_top != NULL)
                sof_sonode_cleanup(so);

        /* Clean up any remnants of krecv callbacks */
        so->so_krecv_cb = NULL;
        so->so_krecv_arg = NULL;

        /* The accept queues must be empty and the sonode on neither list. */
        ASSERT(list_is_empty(&so->so_acceptq_list));
        ASSERT(list_is_empty(&so->so_acceptq_defer));
        ASSERT(!list_link_active(&so->so_acceptq_node));

        /* No received data may remain queued. */
        ASSERT(so->so_rcv_queued == 0);
        ASSERT(so->so_rcv_q_head == NULL);
        ASSERT(so->so_rcv_q_last_head == NULL);
        ASSERT(so->so_rcv_head == NULL);
        ASSERT(so->so_rcv_last_head == NULL);
}