root/usr/src/uts/common/fs/fd/fdops.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2017 by Delphix. All rights reserved.
 */

/*      Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/*        All rights reserved.          */


#include <sys/types.h>
#include <sys/param.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/file.h>
#include <sys/inline.h>
#include <sys/kmem.h>
#include <sys/pathname.h>
#include <sys/resource.h>
#include <sys/statvfs.h>
#include <sys/mount.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/uio.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/vnode.h>
#include <sys/cred.h>
#include <sys/mntent.h>
#include <sys/mount.h>
#include <sys/user.h>
#include <sys/t_lock.h>
#include <sys/modctl.h>
#include <sys/policy.h>
#include <fs/fs_subr.h>
#include <sys/atomic.h>
#include <sys/mkdev.h>

/*
 * round(r) rounds r up to the next multiple of sizeof (int).
 * fdtoi(n) maps file descriptor number n to the inode number reported
 * for its directory entry (n + 100).
 */
#define round(r)        (((r)+sizeof (int)-1)&(~(sizeof (int)-1)))
#define fdtoi(n)        ((n)+100)

/*
 * Fixed-size directory record produced by fdread(): a short inode
 * number followed by a FDDIRSIZE-byte name field.
 */
#define FDDIRSIZE 14
struct fddirect {
        short   d_ino;
        char    d_name[FDDIRSIZE];
};

/*
 * FDROOTINO is the inode number reported for the root directory ("." and
 * ".." both use it).  FDSDSIZE is the size of one struct fddirect and is
 * also the unit in which directory offsets advance in fdread() and
 * fdreaddir().  FDNSIZE sizes the name portion of the dirent64 buffer in
 * fdreaddir() and is reported as f_namemax by fdstatvfs().
 */
#define FDROOTINO       2
#define FDSDSIZE        sizeof (struct fddirect)
#define FDNSIZE         10

/*
 * fdfstype: file system type index, set by fdinit().
 * fdfsmaj:  major number used for vfs_dev of each mount (from getudev()).
 * fdfsmin:  last minor number handed out by fdmount().
 * fdrmaj:   major number used for v_rdev of per-descriptor vnodes.
 * fd_minor_lock: held by fdmount() while it advances fdfsmin.
 */
static int              fdfstype = 0;
static major_t          fdfsmaj;
static minor_t          fdfsmin;
static major_t          fdrmaj;
static kmutex_t         fd_minor_lock;

/* Defined after the vnode ops template; used by fdlookup() and fdcreate(). */
static int fdget(vnode_t *, char *, vnode_t **);

/*
 * Open an fdfs vnode.  Every vnode other than a directory gets VDUP set
 * in v_flag (under v_lock); directories are left untouched.  Always
 * succeeds.
 */
/* ARGSUSED */
static int
fdopen(vnode_t **vpp, int mode, cred_t *cr, caller_context_t *ct)
{
        vnode_t *vp = *vpp;

        if (vp->v_type == VDIR)
                return (0);

        mutex_enter(&vp->v_lock);
        vp->v_flag |= VDUP;
        mutex_exit(&vp->v_lock);

        return (0);
}

/*
 * Close an fdfs vnode.  There is no per-open state to tear down, so this
 * does nothing and always returns 0.
 */
/* ARGSUSED */
static int
fdclose(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
        return (0);
}

/*
 * read(2) on the fdfs root directory.  The directory is presented as an
 * array of fixed-size struct fddirect records: "." and ".." first, then
 * one record per file descriptor slot.  The number of descriptor records
 * is the smaller of the process's fi_nfiles and its enforced
 * RLIMIT_NOFILE resource control.
 *
 * Reads may start and stop at any byte offset inside a record.  Returns
 * ENOSYS for a non-directory vnode, 0 when the offset is outside the
 * directory or nothing is requested, otherwise the result of uiomove().
 */
/* ARGSUSED */
static int
fdread(vnode_t *vp, uio_t *uiop, int ioflag, cred_t *cr, caller_context_t *ct)
{
        static struct fddirect dotbuf[] = {
                { FDROOTINO, "."  },
                { FDROOTINO, ".." }
        };
        struct fddirect ent;
        rctl_qty_t nofile;
        int nfds, dirend;
        int first, last, limit, skip, fd;
        int error = 0;

        if (vp->v_type != VDIR)
                return (ENOSYS);

        mutex_enter(&curproc->p_lock);
        nofile = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
            curproc->p_rctls, curproc);
        nfds = MIN(P_FINFO(curproc)->fi_nfiles, (int)nofile);
        mutex_exit(&curproc->p_lock);

        /* Byte offset just past the last record of the directory. */
        dirend = (nfds + 2) * FDSDSIZE;

        if (uiop->uio_loffset < (offset_t)0 ||
            uiop->uio_loffset >= (offset_t)dirend ||
            uiop->uio_resid <= 0)
                return (0);
        ASSERT(uiop->uio_loffset <= MAXOFF_T);

        /*
         * Serve whatever part of the request overlaps "." and "..".
         */
        if (uiop->uio_offset < 2*FDSDSIZE) {
                error = uiomove((caddr_t)dotbuf + uiop->uio_offset,
                    MIN(uiop->uio_resid, 2*FDSDSIZE - uiop->uio_offset),
                    UIO_READ, uiop);
                if (uiop->uio_resid <= 0 || error)
                        return (error);
        }

        /*
         * first is the descriptor whose record contains the current
         * offset; last is an upper bound derived from the remaining
         * byte count (the loop also stops as soon as resid reaches 0);
         * skip is the byte offset into the first record.
         */
        first = (uiop->uio_offset - 2*FDSDSIZE)/FDSDSIZE;
        last = (uiop->uio_offset + uiop->uio_resid - 1)/FDSDSIZE;
        skip = uiop->uio_offset % FDSDSIZE;
        limit = MIN(last, nfds);

        bzero(ent.d_name, FDDIRSIZE);
        for (fd = first; fd < limit; fd++) {
                ent.d_ino = fdtoi(fd);
                numtos((ulong_t)fd, ent.d_name);
                error = uiomove((caddr_t)&ent + skip,
                    MIN(uiop->uio_resid, FDSDSIZE - skip),
                    UIO_READ, uiop);
                if (uiop->uio_resid <= 0 || error)
                        break;
                /* Only the first record can be entered mid-way. */
                skip = 0;
        }

        return (error);
}

/*
 * Fill in *vap for an fdfs vnode.  The root directory reports two links,
 * mode 0555, inode FDROOTINO and a size of (fi_nfiles + 2) directory
 * records; every other vnode reports one link, mode 0666, size 0 and an
 * inode derived from the minor number of v_rdev.  All timestamps are the
 * current time and the owner is uid/gid 0.  Always returns 0.
 */
/* ARGSUSED */
static int
fdgetattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
        vfs_t *vfsp = vp->v_vfsp;
        timestruc_t now;

        /* Attributes shared by the directory and the descriptor nodes. */
        vap->va_type = vp->v_type;
        vap->va_rdev = vp->v_rdev;
        vap->va_fsid = vfsp->vfs_dev;
        vap->va_blksize = vfsp->vfs_bsize;
        vap->va_nblocks = (fsblkcnt64_t)0;
        vap->va_uid = 0;
        vap->va_gid = 0;
        vap->va_seq = 0;

        gethrestime(&now);
        vap->va_atime = now;
        vap->va_mtime = now;
        vap->va_ctime = now;

        if (vp->v_type != VDIR) {
                vap->va_nlink = 1;
                vap->va_size = (u_offset_t)0;
                vap->va_mode = 0666;
                vap->va_nodeid = (ino64_t)fdtoi(getminor(vp->v_rdev));
                return (0);
        }

        vap->va_nlink = 2;
        vap->va_size = (u_offset_t)
            ((P_FINFO(curproc)->fi_nfiles + 2) * FDSDSIZE);
        vap->va_mode = 0555;
        vap->va_nodeid = (ino64_t)FDROOTINO;
        return (0);
}

/*
 * Access check for fdfs vnodes.  No check is performed here: every
 * request is granted (returns 0) regardless of mode or credentials.
 */
/* ARGSUSED */
static int
fdaccess(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
        return (0);
}

/*
 * Look up comp in the fdfs directory dp.  The empty name, "." and ".."
 * all resolve to dp itself (returned with an extra hold); any other name
 * is handed to fdget(), which accepts decimal descriptor numbers only.
 */
/* ARGSUSED */
static int
fdlookup(vnode_t *dp, char *comp, vnode_t **vpp, pathname_t *pnp, int flags,
    vnode_t *rdir, cred_t *cr, caller_context_t *ct, int *direntflags,
    pathname_t *realpnp)
{
        if (comp[0] != 0 && strcmp(comp, ".") != 0 &&
            strcmp(comp, "..") != 0)
                return (fdget(dp, comp, vpp));

        VN_HOLD(dp);
        *vpp = dp;
        return (0);
}

/*
 * Create entry point.  Nothing is actually created: the name is simply
 * resolved through fdget(), exactly as a lookup of a descriptor name
 * would be, and vap/excl/mode/flag are ignored.
 */
/* ARGSUSED */
static int
fdcreate(vnode_t *dvp, char *comp, vattr_t *vap, enum vcexcl excl, int mode,
    vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
    vsecattr_t *vsecp)
{
        return (fdget(dvp, comp, vpp));
}

/*
 * Read entries of the fdfs root directory in dirent64 format.
 *
 * Directory offsets advance in units of FDSDSIZE: offset 0 is ".",
 * offset FDSDSIZE is "..", and offset (n + 2) * FDSDSIZE is file
 * descriptor n.  The number of descriptor entries is the smaller of the
 * process's fi_nfiles and its enforced RLIMIT_NOFILE resource control.
 *
 * Returns ENOENT if the offset is negative or not a multiple of FDSDSIZE
 * or if no space was supplied, EINVAL if the buffer cannot hold even the
 * first entry, EFAULT if copying an entry out fails, and 0 otherwise.
 * When eofp is non-NULL, *eofp is set to non-zero once the resulting
 * offset is at or past the end of the directory.
 */
/* ARGSUSED */
static int
fdreaddir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, caller_context_t *ct,
    int flags)
{
        /* bp holds one dirent structure */
        u_offset_t bp[DIRENT64_RECLEN(FDNSIZE) / sizeof (u_offset_t)];
        struct dirent64 *dirent = (struct dirent64 *)bp;
        int reclen, nentries;
        rctl_qty_t fdno_ctl;
        int  n;
        int oresid;
        off_t off;

        if (uiop->uio_offset < 0 || uiop->uio_resid <= 0 ||
            (uiop->uio_offset % FDSDSIZE) != 0)
                return (ENOENT);

        ASSERT(uiop->uio_loffset <= MAXOFF_T);
        oresid = uiop->uio_resid;
        bzero(bp, sizeof (bp));

        mutex_enter(&curproc->p_lock);
        fdno_ctl = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
            curproc->p_rctls, curproc);
        nentries = MIN(P_FINFO(curproc)->fi_nfiles, (int)fdno_ctl);
        mutex_exit(&curproc->p_lock);

        while (uiop->uio_resid > 0) {
                if ((off = uiop->uio_offset) == 0) {    /* "." */
                        dirent->d_ino = (ino64_t)FDROOTINO;
                        dirent->d_name[0] = '.';
                        dirent->d_name[1] = '\0';
                        reclen = DIRENT64_RECLEN(1);
                } else if (off == FDSDSIZE) {           /* ".." */
                        dirent->d_ino = (ino64_t)FDROOTINO;
                        dirent->d_name[0] = '.';
                        dirent->d_name[1] = '.';
                        dirent->d_name[2] = '\0';
                        reclen = DIRENT64_RECLEN(2);
                } else {
                        /*
                         * Return entries corresponding to the allowable
                         * number of file descriptors for this process.
                         */
                        if ((n = (off-2*FDSDSIZE)/FDSDSIZE) >= nentries)
                                break;
                        dirent->d_ino = (ino64_t)fdtoi(n);
                        numtos((ulong_t)n, dirent->d_name);
                        reclen = DIRENT64_RECLEN(strlen(dirent->d_name));
                }
                dirent->d_off = (offset_t)(uiop->uio_offset + FDSDSIZE);
                dirent->d_reclen = (ushort_t)reclen;

                if (reclen > uiop->uio_resid) {
                        /*
                         * Error if no entries have been returned yet.
                         */
                        if (uiop->uio_resid == oresid)
                                return (EINVAL);
                        break;
                }
                /*
                 * uiomove() updates both resid and offset by the same
                 * amount.  But we want offset to change in increments
                 * of FDSDSIZE, which is different from the number of bytes
                 * being returned to the user.  So we set uio_offset
                 * separately, ignoring what uiomove() does.
                 */
                if (uiomove((caddr_t)dirent, reclen, UIO_READ, uiop))
                        return (EFAULT);
                uiop->uio_offset = off + FDSDSIZE;
        }
        if (eofp) {
                /*
                 * Compare against the end offset in signed arithmetic.
                 * FDSDSIZE is a size_t, so computing
                 * (uio_offset - 2*FDSDSIZE) while the offset is still
                 * inside the "." / ".." slots would wrap around to a
                 * huge unsigned value and report EOF even though
                 * entries remain.
                 */
                *eofp = (uiop->uio_offset >=
                    (off_t)(nentries + 2) * (off_t)FDSDSIZE);
        }
        return (0);
}

/*
 * Drop one reference on vp under v_lock.  If that was the last
 * reference, the vnode is invalidated and freed; otherwise it is left
 * alone for the remaining holders.
 */
/* ARGSUSED */
static void
fdinactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
        int unreferenced;

        mutex_enter(&vp->v_lock);
        ASSERT(vp->v_count >= 1);
        VN_RELE_LOCKED(vp);
        unreferenced = (vp->v_count == 0);
        mutex_exit(&vp->v_lock);

        if (unreferenced) {
                vn_invalid(vp);
                vn_free(vp);
        }
}

/*
 * Vnode operations vector for fdfs; built from the template below by
 * vn_make_ops() in fdinit() and installed on every vnode created by
 * fdmount() and fdget().
 */
static struct vnodeops *fd_vnodeops;

/*
 * Template mapping vnode operation names to the fdfs implementations in
 * this file.  FRLOCK, POLL and DISPOSE are explicitly routed to
 * fs_error.  The list is terminated by a NULL name.
 */
static const fs_operation_def_t fd_vnodeops_template[] = {
        VOPNAME_OPEN,           { .vop_open = fdopen },
        VOPNAME_CLOSE,          { .vop_close = fdclose },
        VOPNAME_READ,           { .vop_read = fdread },
        VOPNAME_GETATTR,        { .vop_getattr = fdgetattr },
        VOPNAME_ACCESS,         { .vop_access = fdaccess },
        VOPNAME_LOOKUP,         { .vop_lookup = fdlookup },
        VOPNAME_CREATE,         { .vop_create = fdcreate },
        VOPNAME_READDIR,        { .vop_readdir = fdreaddir },
        VOPNAME_INACTIVE,       { .vop_inactive = fdinactive },
        VOPNAME_FRLOCK,         { .error = fs_error },
        VOPNAME_POLL,           { .error = fs_error },
        VOPNAME_DISPOSE,        { .error = fs_error },
        NULL,                   NULL
};

static int
fdget(struct vnode *dvp, char *comp, struct vnode **vpp)
{
        int n = 0;
        struct vnode *vp;

        while (*comp) {
                if (*comp < '0' || *comp > '9')
                        return (ENOENT);
                n = 10 * n + *comp++ - '0';
        }
        vp = vn_alloc(KM_SLEEP);
        vp->v_type = VCHR;
        vp->v_vfsp = dvp->v_vfsp;
        vn_setops(vp, fd_vnodeops);
        vp->v_data = NULL;
        vp->v_flag = VNOMAP;
        vp->v_rdev = makedevice(fdrmaj, n);
        vn_exists(vp);
        *vpp = vp;
        return (0);
}

/*
 * Mount fdfs on mvp.
 *
 * fdfs normally lives on /dev/fd, but two cases deserve attention: two
 * threads racing to perform the same mount (protected by vfs locking),
 * and two threads mounting fdfs in different places.  Each mount gets
 * its own device number, chosen under fd_minor_lock.
 *
 * Returns EPERM if the caller fails the mount policy check, ENOTDIR if
 * mvp is not a directory, EBUSY if mvp is in use or is a root vnode and
 * MS_OVERLAY was not requested, and 0 on success.
 */
/*ARGSUSED*/
static int
fdmount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
{
        struct vnode *rootvp;
        int busy;

        if (secpolicy_fs_mount(cr, mvp, vfsp) != 0)
                return (EPERM);
        if (mvp->v_type != VDIR)
                return (ENOTDIR);

        mutex_enter(&mvp->v_lock);
        busy = ((uap->flags & MS_OVERLAY) == 0 &&
            (mvp->v_count > 1 || (mvp->v_flag & VROOT)));
        mutex_exit(&mvp->v_lock);
        if (busy)
                return (EBUSY);

        /*
         * Having the resource be anything but "fd" doesn't make sense
         */
        vfs_setresource(vfsp, "fd", 0);

        /* Build the root directory vnode for this mount. */
        rootvp = vn_alloc(KM_SLEEP);
        rootvp->v_vfsp = vfsp;
        vn_setops(rootvp, fd_vnodeops);
        rootvp->v_type = VDIR;
        rootvp->v_data = NULL;
        rootvp->v_flag |= VROOT;

        vfsp->vfs_fstype = fdfstype;
        vfsp->vfs_data = (char *)rootvp;

        /* Advance the minor number until an unused device is found. */
        mutex_enter(&fd_minor_lock);
        for (;;) {
                fdfsmin = (fdfsmin + 1) & L_MAXMIN32;
                vfsp->vfs_dev = makedevice(fdfsmaj, fdfsmin);
                if (!vfs_devismounted(vfsp->vfs_dev))
                        break;
        }
        mutex_exit(&fd_minor_lock);

        vfs_make_fsid(&vfsp->vfs_fsid, vfsp->vfs_dev, fdfstype);
        vfsp->vfs_bsize = 1024;
        return (0);
}

/*
 * Unmount an fdfs instance.  Returns EPERM if the caller fails the
 * unmount policy check, ENOTSUP for a forced unmount (not supported by
 * this file system), EBUSY if anything besides the mount itself still
 * holds the root vnode, and 0 after releasing the root vnode.
 */
/* ARGSUSED */
static int
fdunmount(vfs_t *vfsp, int flag, cred_t *cr)
{
        vnode_t *rootvp;

        if (secpolicy_fs_unmount(cr, vfsp) != 0)
                return (EPERM);

        if ((flag & MS_FORCE) != 0)
                return (ENOTSUP);

        rootvp = (vnode_t *)vfsp->vfs_data;
        if (rootvp->v_count <= 1) {
                /* Only the mount's own hold is left; drop it. */
                VN_RELE(rootvp);
                return (0);
        }

        return (EBUSY);
}

/*
 * Return the root vnode of the mount (stored in vfs_data by fdmount())
 * through *vpp, with an additional hold for the caller.  Always
 * returns 0.
 */
/* ARGSUSED */
static int
fdroot(vfs_t *vfsp, vnode_t **vpp)
{
        vnode_t *rootvp;

        rootvp = (vnode_t *)vfsp->vfs_data;
        *vpp = rootvp;
        VN_HOLD(rootvp);
        return (0);
}

/*
 * Report file system statistics for an fdfs mount.
 *
 * No locking of the vfs is done here.  Per the original note, the caller
 * holds the root vnode before calling this function, so its v_count is
 * greater than 1 and fdunmount() will refuse to proceed; the vfs
 * therefore cannot disappear underneath us.
 *
 * Block counts are all zero, the block and fragment sizes are 1024, and
 * f_namemax is FDNSIZE.  Always returns 0.
 */
static int
fdstatvfs(struct vfs *vfsp, struct statvfs64 *sp)
{
        rctl_qty_t nofile;
        dev32_t fsid32;

        mutex_enter(&curproc->p_lock);
        nofile = rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
            curproc->p_rctls, curproc);
        mutex_exit(&curproc->p_lock);

        bzero(sp, sizeof (*sp));

        /* Sizes and block accounting: fdfs occupies no storage. */
        sp->f_bsize = 1024;
        sp->f_frsize = 1024;
        sp->f_blocks = (fsblkcnt64_t)0;
        sp->f_bfree = (fsblkcnt64_t)0;
        sp->f_bavail = (fsblkcnt64_t)0;

        /*
         * NOTE(review): the "+ 2" is applied to the resource control
         * value inside MIN(), whereas fdread()/fdreaddir() add the two
         * dot entries after taking the minimum.  Confirm which is
         * intended before changing it.
         */
        sp->f_files = (fsfilcnt64_t)
            (MIN(P_FINFO(curproc)->fi_nfiles, nofile + 2));
        sp->f_ffree = (fsfilcnt64_t)0;
        sp->f_favail = (fsfilcnt64_t)0;

        /* Identification. */
        (void) cmpldev(&fsid32, vfsp->vfs_dev);
        sp->f_fsid = fsid32;
        (void) strcpy(sp->f_basetype, vfssw[fdfstype].vsw_name);
        sp->f_flag = vf_to_stf(vfsp->vfs_flag);
        sp->f_namemax = FDNSIZE;

        /* f_fstr carries two NUL-terminated strings back to back. */
        (void) strcpy(sp->f_fstr, "/dev/fd");
        (void) strcpy(sp->f_fstr + 8, "/dev/fd");
        return (0);
}

/*
 * One-time initialization of the fd file system type.
 *
 * fstype is the file system type index assigned to fdfs and name is the
 * name under which the vnode operations are registered.  Registers the
 * VFS operations for fstype, builds the vnode operations vector
 * (fd_vnodeops), obtains the two major numbers used for device numbers,
 * and initializes fd_minor_lock.
 *
 * Returns 0 on success or the error from vfs_setfsops()/vn_make_ops().
 * Failure to obtain unique device numbers is only warned about; major 0
 * is used in that case.
 */
int
fdinit(int fstype, char *name)
{
        static const fs_operation_def_t fd_vfsops_template[] = {
                VFSNAME_MOUNT,          { .vfs_mount = fdmount },
                VFSNAME_UNMOUNT,        { .vfs_unmount = fdunmount },
                VFSNAME_ROOT,           { .vfs_root = fdroot },
                VFSNAME_STATVFS,        { .vfs_statvfs = fdstatvfs },
                NULL,                   NULL
        };
        int error;

        fdfstype = fstype;
        ASSERT(fdfstype != 0);

        /*
         * Associate VFS ops vector with this fstype.
         */
        error = vfs_setfsops(fstype, fd_vfsops_template, NULL);
        if (error != 0) {
                /* This is the VFS template, not the vnode one. */
                cmn_err(CE_WARN, "fdinit: bad vfs ops template");
                return (error);
        }

        error = vn_make_ops(name, fd_vnodeops_template, &fd_vnodeops);
        if (error != 0) {
                /* Undo the VFS ops registration made above. */
                (void) vfs_freevfsops_by_type(fstype);
                cmn_err(CE_WARN, "fdinit: bad vnode ops template");
                return (error);
        }

        /*
         * Assign unique "device" numbers (reported by stat(2)).
         */
        fdfsmaj = getudev();
        fdrmaj = getudev();
        if (fdfsmaj == (major_t)-1 || fdrmaj == (major_t)-1) {
                cmn_err(CE_WARN, "fdinit: can't get unique device numbers");
                if (fdfsmaj == (major_t)-1)
                        fdfsmaj = 0;
                if (fdrmaj == (major_t)-1)
                        fdrmaj = 0;
        }
        mutex_init(&fd_minor_lock, NULL, MUTEX_DEFAULT, NULL);
        return (0);
}

/*
 * FDFS Mount options table
 */
/* Options cancelled when MNTOPT_RW is set. */
static char *rw_cancel[] = { MNTOPT_RO, NULL };

static mntopt_t mntopts[] = {
/*
 *      option name             cancel option   default arg     flags
 */
        { MNTOPT_RW,            rw_cancel,      NULL,           MO_DEFAULT,
                (void *)MNTOPT_NOINTR },
        { MNTOPT_IGNORE,        NULL,           NULL,           0,
                (void *)0 },
};

/* Option count and table handed to the VFS framework through vfw. */
static mntopts_t fdfs_mntopts = {
        sizeof (mntopts) / sizeof (mntopt_t),
        mntopts
};

/*
 * File system definition: type name "fd", initialization routine
 * fdinit(), type flags, and the mount options prototype above.
 */
static vfsdef_t vfw = {
        VFSDEF_VERSION,
        "fd",
        fdinit,
        VSW_HASPROTO | VSW_ZMOUNT,
        &fdfs_mntopts
};

/*
 * Module linkage information for the kernel.
 */
static struct modlfs modlfs = {
        &mod_fsops,
        "filesystem for fd",
        &vfw
};

static struct modlinkage modlinkage = {
        MODREV_1,
        &modlfs,
        NULL
};

/*
 * Loadable module entry point: install the module described by
 * modlinkage and return the result of mod_install().
 * NOTE(review): no _fini() is visible in this file, so the module is
 * presumably not meant to be unloaded -- confirm.
 */
int
_init(void)
{
        return (mod_install(&modlinkage));
}

/*
 * Loadable module entry point: report module information for
 * modlinkage into *modinfop via mod_info().
 */
int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}