root/sys/fs/nullfs/null_vfsops.c
/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1992, 1993, 1995
 *      The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software donated to Berkeley by
 * Jan-Simon Pendry.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Null Layer
 * (See null_vnops.c for a description of what this does.)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/jail.h>

#include <fs/nullfs/null.h>

/*
 * Malloc type used for the per-mount struct null_mount that nullfs_mount()
 * allocates and nullfs_unmount() frees.
 */
static MALLOC_DEFINE(M_NULLFSMNT, "nullfs_mount", "NULLFS mount structure");

/*
 * Forward declarations of the VFS operations implemented in this file; they
 * are collected in the null_vfsops table at the bottom of the file.
 */
static vfs_fhtovp_t     nullfs_fhtovp;
static vfs_mount_t      nullfs_mount;
static vfs_quotactl_t   nullfs_quotactl;
static vfs_root_t       nullfs_root;
static vfs_sync_t       nullfs_sync;
static vfs_statfs_t     nullfs_statfs;
static vfs_unmount_t    nullfs_unmount;
static vfs_vget_t       nullfs_vget;
static vfs_extattrctl_t nullfs_extattrctl;

/* Parent node for the nullfs sysctl knobs declared under _vfs_nullfs. */
SYSCTL_NODE(_vfs, OID_AUTO, nullfs, CTLFLAG_RW, 0, "nullfs");

/*
 * Default caching policy.  nullfs_mount() consults this only when neither
 * the "cache" nor the "nocache" mount option was given; when it is true
 * (and the lower mount does not set MNTK_NULL_NOCACHE) the new mount gets
 * NULLM_CACHE.
 */
static bool null_cache_vnodes = true;
SYSCTL_BOOL(_vfs_nullfs, OID_AUTO, cache_vnodes, CTLFLAG_RWTUN,
    &null_cache_vnodes, 0,
    "cache free nullfs vnodes");

/*
 * Mount null layer
 *
 * Looks up the lower path named by the "from" mount option (falling back
 * to "target"), registers this mount as an upper of the lower filesystem,
 * allocates the struct null_mount private data and derives the mount's
 * flags from the lower mount and from the cache/nocache and
 * unixbypass/nounixbypass options.
 *
 * Update mounts do no work: they succeed only when the "export" option is
 * present.
 *
 * Returns 0 on success, or:
 *   EOPNOTSUPP  mounting as the root filesystem, or an update without
 *               "export";
 *   EINVAL      the path option is missing, empty or not NUL-terminated,
 *               or the lower vnode is neither a directory nor a regular
 *               file, or its type differs from the covered vnode's;
 *   EDEADLK     the covered vnode is a nullfs vnode that has no null_node
 *               or already aliases the lower vnode;
 *   ENOENT      vfs_register_upper_from_vp() returned no lower mount;
 *   otherwise the error reported by namei() or null_nodeget().
 */
static int
nullfs_mount(struct mount *mp)
{
        struct vnode *lowerrootvp;
        struct vnode *nullm_rootvp;
        struct null_mount *xmp;
        struct null_node *nn;
        struct nameidata nd, *ndp;
        char *target;
        int error, len;
        bool isvnunlocked;
        static const char cache_opt_name[] = "cache";
        static const char nocache_opt_name[] = "nocache";
        static const char unixbypass_opt_name[] = "unixbypass";
        static const char nounixbypass_opt_name[] = "nounixbypass";

        NULLFSDEBUG("nullfs_mount(mp = %p)\n", (void *)mp);

        if (mp->mnt_flag & MNT_ROOTFS)
                return (EOPNOTSUPP);

        /*
         * Update is a no-op
         */
        if (mp->mnt_flag & MNT_UPDATE) {
                /*
                 * Only support update mounts for NFS export.
                 */
                if (vfs_flagopt(mp->mnt_optnew, "export", NULL, 0))
                        return (0);
                else
                        return (EOPNOTSUPP);
        }

        /*
         * Get argument
         */
        error = vfs_getopt(mp->mnt_optnew, "from", (void **)&target, &len);
        if (error != 0)
                error = vfs_getopt(mp->mnt_optnew, "target", (void **)&target, &len);
        /*
         * Reject an empty option before looking at its last byte:
         * with len == 0 the index below would be -1.
         */
        if (error != 0 || len <= 0 || target[len - 1] != '\0')
                return (EINVAL);

        /*
         * Unlock lower node to avoid possible deadlock.
         */
        if (null_is_nullfs_vnode(mp->mnt_vnodecovered) &&
            VOP_ISLOCKED(mp->mnt_vnodecovered) == LK_EXCLUSIVE) {
                VOP_UNLOCK(mp->mnt_vnodecovered);
                isvnunlocked = true;
        } else {
                isvnunlocked = false;
        }

        /*
         * Find lower node
         */
        ndp = &nd;
        NDINIT(ndp, LOOKUP, FOLLOW|LOCKLEAF, UIO_SYSSPACE, target);
        error = namei(ndp);

        /*
         * Re-lock vnode.
         * XXXKIB This is deadlock-prone as well.
         */
        if (isvnunlocked)
                vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY);

        if (error)
                return (error);
        NDFREE_PNBUF(ndp);

        /*
         * Sanity check on lower vnode
         */
        lowerrootvp = ndp->ni_vp;

        /*
         * Check multi null mount to avoid `lock against myself' panic.
         */
        if (null_is_nullfs_vnode(mp->mnt_vnodecovered)) {
                nn = VTONULL(mp->mnt_vnodecovered);
                if (nn == NULL || lowerrootvp == nn->null_lowervp) {
                        NULLFSDEBUG("nullfs_mount: multi null mount?\n");
                        vput(lowerrootvp);
                        return (EDEADLK);
                }
        }

        /*
         * Lower vnode must be the same type as the covered vnode - we
         * don't allow mounting directories to files or vice versa.
         */
        if ((lowerrootvp->v_type != VDIR && lowerrootvp->v_type != VREG) ||
            lowerrootvp->v_type != mp->mnt_vnodecovered->v_type) {
                NULLFSDEBUG("nullfs_mount: target must be same type as fspath");
                vput(lowerrootvp);
                return (EINVAL);
        }

        xmp = malloc(sizeof(struct null_mount), M_NULLFSMNT,
            M_WAITOK | M_ZERO);

        /*
         * Save pointer to underlying FS and the reference to the
         * lower root vnode.
         */
        xmp->nullm_vfs = vfs_register_upper_from_vp(lowerrootvp, mp,
            &xmp->upper_node);
        if (xmp->nullm_vfs == NULL) {
                vput(lowerrootvp);
                free(xmp, M_NULLFSMNT);
                return (ENOENT);
        }
        vref(lowerrootvp);
        xmp->nullm_lowerrootvp = lowerrootvp;
        mp->mnt_data = xmp;

        /*
         * Make sure the node alias worked.
         *
         * On failure only the extra vref() taken above is dropped here;
         * presumably null_nodeget() has already released the locked
         * reference obtained from namei() (verify against null_subr.c).
         * mnt_data is cleared so it never points at freed memory, the
         * same way nullfs_unmount() does it.
         */
        error = null_nodeget(mp, lowerrootvp, &nullm_rootvp);
        if (error != 0) {
                vfs_unregister_upper(xmp->nullm_vfs, &xmp->upper_node);
                vrele(lowerrootvp);
                mp->mnt_data = NULL;
                free(xmp, M_NULLFSMNT);
                return (error);
        }

        /* A null mount over a local filesystem is itself local. */
        if (NULLVPTOLOWERVP(nullm_rootvp)->v_mount->mnt_flag & MNT_LOCAL) {
                MNT_ILOCK(mp);
                mp->mnt_flag |= MNT_LOCAL;
                MNT_IUNLOCK(mp);
        }

        /*
         * Caching policy: an explicit "cache" wins, an explicit "nocache"
         * leaves the flag clear, otherwise follow the sysctl default
         * unless the lower mount sets MNTK_NULL_NOCACHE.
         */
        if (vfs_getopt(mp->mnt_optnew, cache_opt_name, NULL, NULL) == 0) {
                xmp->nullm_flags |= NULLM_CACHE;
        } else if (vfs_getopt(mp->mnt_optnew, nocache_opt_name, NULL,
            NULL) == 0) {
                ;
        } else if (null_cache_vnodes &&
            (xmp->nullm_vfs->mnt_kern_flag & MNTK_NULL_NOCACHE) == 0) {
                xmp->nullm_flags |= NULLM_CACHE;
        }

        if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
                vfs_register_for_notification(xmp->nullm_vfs, mp,
                    &xmp->notify_node);
        }

        /*
         * "unixbypass" is the default and sets nothing; only
         * "nounixbypass" (when "unixbypass" is absent) sets a flag.
         */
        if (vfs_getopt(mp->mnt_optnew, unixbypass_opt_name, NULL, NULL) == 0) {
                ;
        } else if (vfs_getopt(mp->mnt_optnew, nounixbypass_opt_name, NULL,
            NULL) == 0) {
                xmp->nullm_flags |= NULLM_NOUNPBYPASS;
        }

        /*
         * Mounting a vnode over itself: mark it VV_CROSSLOCK.
         * nullfs_unmount() clears the flag again.
         */
        if (lowerrootvp == mp->mnt_vnodecovered) {
                vn_lock(lowerrootvp, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE);
                lowerrootvp->v_vflag |= VV_CROSSLOCK;
                VOP_UNLOCK(lowerrootvp);
        }

        /*
         * Inherit kernel flags from the lower mount; the shared-locking
         * ones are inherited only for caching mounts.
         */
        MNT_ILOCK(mp);
        if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
                mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
                    (MNTK_SHARED_WRITES | MNTK_LOOKUP_SHARED |
                    MNTK_EXTENDED_SHARED);
        }
        mp->mnt_kern_flag |= MNTK_NOMSYNC | MNTK_UNLOCKED_INSMNTQUE;
        mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
            (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS);
        MNT_IUNLOCK(mp);
        vfs_getnewfsid(mp);
        vfs_mountedfrom(mp, target);
        vput(nullm_rootvp);

        NULLFSDEBUG("nullfs_mount: lower %s, alias at %s\n",
                mp->mnt_stat.f_mntfromname, mp->mnt_stat.f_mntonname);
        return (0);
}

/*
 * Free reference to null layer
 */
static int
nullfs_unmount(struct mount *mp, int mntflags)
{
        struct null_mount *mntdata;
        int error, flags;

        NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);

        if (mntflags & MNT_FORCE)
                flags = FORCECLOSE;
        else
                flags = 0;

        for (;;) {
                /* There is 1 extra root vnode reference (nullm_rootvp). */
                error = vflush(mp, 0, flags, curthread);
                if (error)
                        return (error);
                MNT_ILOCK(mp);
                if (mp->mnt_nvnodelistsize == 0) {
                        MNT_IUNLOCK(mp);
                        break;
                }
                MNT_IUNLOCK(mp);
                if ((mntflags & MNT_FORCE) == 0)
                        return (EBUSY);
        }

        /*
         * Finally, throw away the null_mount structure
         */
        mntdata = mp->mnt_data;
        if ((mntdata->nullm_flags & NULLM_CACHE) != 0) {
                vfs_unregister_for_notification(mntdata->nullm_vfs,
                    &mntdata->notify_node);
        }
        if (mntdata->nullm_lowerrootvp == mp->mnt_vnodecovered) {
                vn_lock(mp->mnt_vnodecovered, LK_EXCLUSIVE | LK_RETRY | LK_CANRECURSE);
                mp->mnt_vnodecovered->v_vflag &= ~VV_CROSSLOCK;
                VOP_UNLOCK(mp->mnt_vnodecovered);
        }
        vfs_unregister_upper(mntdata->nullm_vfs, &mntdata->upper_node);
        vrele(mntdata->nullm_lowerrootvp);
        mp->mnt_data = NULL;
        free(mntdata, M_NULLFSMNT);
        return (0);
}

static int
nullfs_root(struct mount *mp, int flags, struct vnode **vpp)
{
        struct vnode *vp;
        struct null_mount *mntdata;
        int error;

        mntdata = MOUNTTONULLMOUNT(mp);
        NULLFSDEBUG("nullfs_root(mp = %p, vp = %p)\n", mp,
            mntdata->nullm_lowerrootvp);

        error = vget(mntdata->nullm_lowerrootvp, flags);
        if (error == 0) {
                error = null_nodeget(mp, mntdata->nullm_lowerrootvp, &vp);
                if (error == 0) {
                        *vpp = vp;
                }
        }
        return (error);
}

/*
 * Relay a quotactl request to the lower filesystem.
 *
 * The caller passes the null mount busied (*mp_busy == true).  This
 * function always unbusies it and sets *mp_busy to false, then busies the
 * lower mount and forwards the request with VFS_QUOTACTL().  The lower
 * mount is unbusied here only if it was successfully busied and the lower
 * implementation did not already unbusy it (signalled through the bool
 * passed to VFS_QUOTACTL()).
 *
 * Returns the error from vfs_busy() on the lower mount, otherwise the
 * result of the lower VFS_QUOTACTL().
 */
static int
nullfs_quotactl(struct mount *mp, int cmd, uid_t uid, void *arg, bool *mp_busy)
{
        struct mount *lowermp;
        struct null_mount *mntdata;
        int error;
        bool unbusy;

        mntdata = MOUNTTONULLMOUNT(mp);
        lowermp = atomic_load_ptr(&mntdata->nullm_vfs);
        KASSERT(*mp_busy == true, ("upper mount not busy"));
        /*
         * See comment in sys_quotactl() for an explanation of why the
         * lower mount needs to be busied by the caller of VFS_QUOTACTL()
         * but may be unbusied by the implementation.  We must unbusy
         * the upper mount for the same reason; otherwise a namei lookup
         * issued by the VFS_QUOTACTL() implementation could traverse the
         * upper mount and deadlock.
         */
        vfs_unbusy(mp);
        *mp_busy = false;

        /*
         * If the lower mount could not be busied there is nothing to
         * undo; calling vfs_unbusy() on it would drop a busy reference
         * this function never took.
         */
        error = vfs_busy(lowermp, 0);
        if (error != 0)
                return (error);

        unbusy = true;
        error = VFS_QUOTACTL(lowermp, cmd, uid, arg, &unbusy);
        if (unbusy)
                vfs_unbusy(lowermp);

        return (error);
}

static int
nullfs_statfs(struct mount *mp, struct statfs *sbp)
{
        int error;
        struct statfs *mstat;

        NULLFSDEBUG("nullfs_statfs(mp = %p, vp = %p->%p)\n", (void *)mp,
            (void *)MOUNTTONULLMOUNT(mp)->nullm_rootvp,
            (void *)NULLVPTOLOWERVP(MOUNTTONULLMOUNT(mp)->nullm_rootvp));

        mstat = malloc(sizeof(struct statfs), M_STATFS, M_WAITOK | M_ZERO);

        error = VFS_STATFS(MOUNTTONULLMOUNT(mp)->nullm_vfs, mstat);
        if (error) {
                free(mstat, M_STATFS);
                return (error);
        }

        sbp->f_type = mstat->f_type;
        sbp->f_bsize = mstat->f_bsize;
        sbp->f_iosize = mstat->f_iosize;
        sbp->f_blocks = mstat->f_blocks;
        sbp->f_bfree = mstat->f_bfree;
        sbp->f_bavail = mstat->f_bavail;
        sbp->f_files = mstat->f_files;
        sbp->f_ffree = mstat->f_ffree;

        free(mstat, M_STATFS);
        return (0);
}

/*
 * Sync the null mount.
 *
 * Nothing is written back here and success is always reported: the
 * function relies on the null layer holding no cached data of its own
 * (XXX - an assumption, not something this function checks).
 */
static int
nullfs_sync(struct mount *mp, int waitfor)
{
        /* Both arguments are intentionally ignored. */
        (void)mp;
        (void)waitfor;

        return (0);
}

/*
 * Look up a vnode by inode number.
 *
 * The lookup is performed by the lower filesystem through VFS_VGET(); on
 * success the lower vnode left in *vpp is replaced by its nullfs alias
 * obtained from null_nodeget().  The caller must request a lock type in
 * flags (asserted).  Returns the first error encountered, or 0.
 */
static int
nullfs_vget(struct mount *mp, ino_t ino, int flags, struct vnode **vpp)
{
        struct mount *lowermp;
        int error;

        KASSERT((flags & LK_TYPE_MASK) != 0,
            ("nullfs_vget: no lock requested"));

        lowermp = MOUNTTONULLMOUNT(mp)->nullm_vfs;
        error = VFS_VGET(lowermp, ino, flags, vpp);
        if (error == 0)
                error = null_nodeget(mp, *vpp, vpp);
        return (error);
}

static int
nullfs_fhtovp(struct mount *mp, struct fid *fidp, int flags, struct vnode **vpp)
{
        int error;

        error = VFS_FHTOVP(MOUNTTONULLMOUNT(mp)->nullm_vfs, fidp, flags,
            vpp);
        if (error != 0)
                return (error);
        return (null_nodeget(mp, *vpp, vpp));
}

/*
 * Extended attribute control: handed unchanged to the lower filesystem,
 * whose result is returned as is.
 */
static int
nullfs_extattrctl(struct mount *mp, int cmd, struct vnode *filename_vp,
    int namespace, const char *attrname)
{
        struct mount *lowermp;

        lowermp = MOUNTTONULLMOUNT(mp)->nullm_vfs;
        return (VFS_EXTATTRCTL(lowermp, cmd, filename_vp, namespace,
            attrname));
}

/*
 * Called for this mount when a lower vnode is being reclaimed.
 *
 * If the mount has a nullfs vnode for lowervp (per null_hashget()), mark
 * it NULLV_NOUNLOCK, destroy it with vgone() and release it with vput().
 * Does nothing when no such vnode exists.
 */
static void
nullfs_reclaim_lowervp(struct mount *mp, struct vnode *lowervp)
{
        struct vnode *nullvp;

        nullvp = null_hashget(mp, lowervp);
        if (nullvp != NULL) {
                VTONULL(nullvp)->null_flags |= NULLV_NOUNLOCK;
                vgone(nullvp);
                vput(nullvp);
        }
}

/*
 * Called for this mount when a lower vnode is being unlinked.
 *
 * If the mount has a nullfs vnode for lowervp (per null_hashget()), flag
 * it NULLV_DROP and give up the use reference while a hold reference
 * keeps the vnode memory alive.  What happens to the lock afterwards
 * depends on whether dropping the reference doomed the vnode.
 */
static void
nullfs_unlink_lowervp(struct mount *mp, struct vnode *lowervp)
{
        struct null_node *nn;
        struct vnode *nullvp;

        nullvp = null_hashget(mp, lowervp);
        if (nullvp == NULL)
                return;

        nn = VTONULL(nullvp);
        nn->null_flags |= NULLV_DROP | NULLV_NOUNLOCK;
        vhold(nullvp);
        vunref(nullvp);

        if (!VN_IS_DOOMED(nullvp)) {
                /*
                 * Still alive: the nullfs vnode keeps sharing its lock
                 * with the lower vnode, so it must stay locked.
                 * NULLV_NOUNLOCK has no meaning for later reclamations
                 * and is cleared again.
                 */
                ASSERT_VOP_ELOCKED(nullvp, "unlink_lowervp");
                nn->null_flags &= ~NULLV_NOUNLOCK;
        } else {
                /*
                 * Doomed: the vnode's lock has been split from the lower
                 * vnode's lock, so one more unlock is required before the
                 * final vdrop() may free the vnode.
                 */
                VOP_UNLOCK(nullvp);
        }
        vdrop(nullvp);
}

/*
 * VFS operations table for nullfs, registered under the name "nullfs"
 * with the loopback, jail and file-mount capability flags.
 */
static struct vfsops null_vfsops = {
        /* Init/uninit hooks; both functions are defined outside this file. */
        .vfs_init =             nullfs_init,
        .vfs_uninit =           nullfs_uninit,

        /* Per-mount operations. */
        .vfs_mount =            nullfs_mount,
        .vfs_unmount =          nullfs_unmount,
        .vfs_root =             nullfs_root,
        .vfs_statfs =           nullfs_statfs,
        .vfs_sync =             nullfs_sync,

        /* Requests handed on to the lower mount. */
        .vfs_quotactl =         nullfs_quotactl,
        .vfs_extattrctl =       nullfs_extattrctl,
        .vfs_vget =             nullfs_vget,
        .vfs_fhtovp =           nullfs_fhtovp,

        /* Callbacks that receive a lower vnode. */
        .vfs_reclaim_lowervp =  nullfs_reclaim_lowervp,
        .vfs_unlink_lowervp =   nullfs_unlink_lowervp,
};

VFS_SET(null_vfsops, nullfs, VFCF_LOOPBACK | VFCF_JAIL | VFCF_FILEMOUNT);