root/usr/src/uts/common/fs/dev/sdev_zvolops.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2013, 2016 Joyent, Inc.  All rights reserved.
 * Copyright (c) 2014 by Delphix. All rights reserved.
 */

/* vnode ops for the /dev/zvol directory */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/ddi.h>
#include <sys/sunndi.h>
#include <sys/sunldi.h>
#include <fs/fs_subr.h>
#include <sys/fs/dv_node.h>
#include <sys/fs/sdev_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/policy.h>
#include <sys/stat.h>
#include <sys/vfs_opreg.h>

/*
 * vnodeops pointer for /dev/zvol nodes.  It is not assigned in this file;
 * presumably it is built from devzvol_vnodeops_tbl elsewhere -- TODO confirm.
 */
struct vnodeops *devzvol_vnodeops;
/* Major number of the /dev/zfs device, recorded by devzvol_open_zfs(). */
static major_t devzvol_major;
/* Taskq entry used to dispatch devzvol_update_zclist_cb(). */
static taskq_ent_t devzvol_zclist_task;

static kmutex_t devzvol_mtx;
/* Below are protected by devzvol_mtx */
/* B_TRUE while the LDI handle to /dev/zfs is open. */
static boolean_t devzvol_isopen;
/* B_TRUE from dispatch of the pool-list refresh until its callback ends. */
static boolean_t devzvol_zclist_task_running = B_FALSE;
/* Cookie sent with ZFS_IOC_POOL_CONFIGS; replaced by the one returned. */
static uint64_t devzvol_gen = 0;
/* Address of the cached, packed pool-config nvlist (0 when none). */
static uint64_t devzvol_zclist;
/* Allocated size of the devzvol_zclist buffer, as needed by kmem_free(). */
static size_t devzvol_zclist_size;
/* LDI identity and handle used for the /dev/zfs open. */
static ldi_ident_t devzvol_li;
static ldi_handle_t devzvol_lh;

/*
 * We need to use ddi_mod* since fs/dev gets loaded early on in
 * startup(), and linking fs/dev to fs/zfs would drag in a lot of
 * other stuff (like drv/random) before the rest of the system is
 * ready to go.
 *
 * zfs_mod is the handle returned by ddi_modopen("fs/zfs"); szcm and
 * szn2m are the addresses of zvol_create_minor and zvol_name2minor
 * resolved through that handle.  Both pointers are NULL whenever
 * devzvol_close_zfs() has run or the symbols were never resolved.
 */
ddi_modhandle_t zfs_mod;
int (*szcm)(char *);
int (*szn2m)(char *, minor_t *);


/*
 * Controls whether snapshots may appear in /dev/zvol.  When B_FALSE,
 * devzvol_objset_check() rejects any name containing '@' with ENOTSUP and
 * sdev_iter_datasets() does not descend into a zvol's snapshots.  The
 * default is B_TRUE, preserving the historic behavior.
 */
boolean_t devzvol_snaps_allowed = B_TRUE;

/*
 * Create the minor node for the zvol `dsname' by calling the dynamically
 * resolved zvol_create_minor entry point.  Returns -1 if that entry point
 * is not currently resolved, otherwise the value the zfs routine returns.
 */
int
sdev_zvol_create_minor(char *dsname)
{
        if (szcm != NULL)
                return ((*szcm)(dsname));

        return (-1);
}

/*
 * Translate the zvol name `dsname' into its minor number by calling the
 * dynamically resolved zvol_name2minor entry point, which stores the
 * result through `minor'.  Returns -1 if that entry point is not currently
 * resolved, otherwise the value the zfs routine returns.
 */
int
sdev_zvol_name2minor(char *dsname, minor_t *minor)
{
        if (szn2m != NULL)
                return ((*szn2m)(dsname, minor));

        return (-1);
}

/*
 * Open /dev/zfs through LDI and resolve the zvol entry points this file
 * needs from the fs/zfs module.
 *
 * On success returns 0 with devzvol_li/devzvol_lh holding the LDI identity
 * and handle, szcm/szn2m pointing at zvol_create_minor/zvol_name2minor and
 * devzvol_major set to the major number of the opened device.
 *
 * On failure returns non-zero (-1, or the error reported by ddi_modopen()
 * or ddi_modsym()).  The LDI handle and identity are released and the
 * function pointers are reset, so that a later retry starts from a clean
 * state and does not trip the ASSERT below.  A successfully opened zfs_mod
 * is deliberately kept: the next attempt reuses it and devzvol_close_zfs()
 * closes it.
 */
int
devzvol_open_zfs()
{
        int rc;
        dev_t dv;

        devzvol_li = ldi_ident_from_anon();
        if (ldi_open_by_name("/dev/zfs", FREAD | FWRITE, kcred,
            &devzvol_lh, devzvol_li)) {
                /* Nothing was opened; only the identity needs releasing. */
                ldi_ident_release(devzvol_li);
                return (-1);
        }
        if (zfs_mod == NULL && ((zfs_mod = ddi_modopen("fs/zfs",
            KRTLD_MODE_FIRST, &rc)) == NULL)) {
                goto fail;
        }
        ASSERT(szcm == NULL && szn2m == NULL);
        if ((szcm = (int (*)(char *))
            ddi_modsym(zfs_mod, "zvol_create_minor", &rc)) == NULL) {
                cmn_err(CE_WARN, "couldn't resolve zvol_create_minor");
                goto fail;
        }
        if ((szn2m = (int(*)(char *, minor_t *))
            ddi_modsym(zfs_mod, "zvol_name2minor", &rc)) == NULL) {
                cmn_err(CE_WARN, "couldn't resolve zvol_name2minor");
                goto fail;
        }
        if (ldi_get_dev(devzvol_lh, &dv)) {
                rc = -1;
                goto fail;
        }
        devzvol_major = getmajor(dv);
        return (0);

fail:
        /* Undo everything except the module open (see above). */
        szcm = NULL;
        szn2m = NULL;
        (void) ldi_close(devzvol_lh, FREAD | FWRITE, kcred);
        ldi_ident_release(devzvol_li);
        /* Callers test for 0; never report success from a failure path. */
        if (rc == 0)
                rc = -1;
        return (rc);
}

/*
 * Undo devzvol_open_zfs(): forget the resolved zvol entry points, close the
 * LDI handle to /dev/zfs, release the LDI identity and, if the fs/zfs
 * module handle is open, close it.
 */
void
devzvol_close_zfs()
{
        /* Stop routing calls into zfs before letting go of it. */
        szcm = NULL;
        szn2m = NULL;

        (void) ldi_close(devzvol_lh, FREAD|FWRITE, kcred);
        ldi_ident_release(devzvol_li);

        if (zfs_mod == NULL)
                return;

        (void) ddi_modclose(zfs_mod);
        zfs_mod = NULL;
}

/*
 * Issue the zfs ioctl `cmd' on the /dev/zfs LDI handle, opening that handle
 * first if necessary, and supply a destination nvlist buffer for the reply.
 *
 * Locking: for every command except ZFS_IOC_POOL_CONFIGS this function
 * takes and drops devzvol_mtx itself.  For ZFS_IOC_POOL_CONFIGS the mutex
 * is not touched here, so the caller must already hold it.
 *
 * The destination buffer starts at 8000 bytes.  Whenever the ioctl fails
 * with ENOMEM the buffer is reallocated at the size zfs wrote back into
 * zc_nvlist_dst_size, the caller's original zc_cookie is restored, and the
 * ioctl is retried.
 *
 * Buffer ownership:
 *   alloc_size == NULL  the buffer is freed here before returning.
 *   alloc_size != NULL  the buffer is left in zc->zc_nvlist_dst and its
 *                       allocated size is stored in *alloc_size; the
 *                       caller frees it, whatever the return value.
 *
 * Returns ENXIO if /dev/zfs could not be opened; in that case no buffer
 * exists, zc->zc_nvlist_dst is set to 0 and *alloc_size (if given) to 0.
 * Otherwise returns the result of ldi_ioctl().
 */
int
devzvol_handle_ioctl(int cmd, zfs_cmd_t *zc, size_t *alloc_size)
{
        boolean_t take_lock = (cmd != ZFS_IOC_POOL_CONFIGS);
        uint64_t cookie;
        size_t size = 8000;
        int unused;
        int rc;

        if (take_lock)
                mutex_enter(&devzvol_mtx);
        if (!devzvol_isopen) {
                if (devzvol_open_zfs() != 0) {
                        if (take_lock)
                                mutex_exit(&devzvol_mtx);
                        /*
                         * Tell the caller explicitly that there is nothing
                         * to free, rather than leaving *alloc_size unset.
                         */
                        zc->zc_nvlist_dst = 0;
                        if (alloc_size != NULL)
                                *alloc_size = 0;
                        return (ENXIO);
                }
                devzvol_isopen = B_TRUE;
        }

        cookie = zc->zc_cookie;
        for (;;) {
                size_t newsize;

                zc->zc_nvlist_dst = (uint64_t)(intptr_t)kmem_alloc(size,
                    KM_SLEEP);
                zc->zc_nvlist_dst_size = size;
                rc = ldi_ioctl(devzvol_lh, cmd, (intptr_t)zc, FKIOCTL, kcred,
                    &unused);
                if (rc != ENOMEM)
                        break;

                /* Buffer too small: retry with the size zfs asked for. */
                newsize = zc->zc_nvlist_dst_size;
                ASSERT(newsize > size);
                kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
                size = newsize;
                zc->zc_cookie = cookie;
        }

        if (alloc_size == NULL)
                kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
        else
                *alloc_size = size;
        if (take_lock)
                mutex_exit(&devzvol_mtx);
        return (rc);
}

/*
 * Determine whether the objset named `dsname' exists and report its type.
 *
 * A name without a '/' is treated as a pool and queried with
 * ZFS_IOC_POOL_STATS; anything else is queried with ZFS_IOC_OBJSET_STATS.
 * The request carries a single option, "cachedpropsonly" = true.
 *
 * Returns 0 on success, in which case *type (when `type' is non-NULL) is
 * DMU_OST_ZFS for a pool or the dds_type reported for the objset.  Returns
 * ENOTSUP without issuing any ioctl if the name contains '@' while
 * devzvol_snaps_allowed is false.  Any other value is the error from
 * devzvol_handle_ioctl().
 */
int
devzvol_objset_check(char *dsname, dmu_objset_type_t *type)
{
        boolean_t       pool_level = (strchr(dsname, '/') == NULL);
        zfs_cmd_t       *cmd;
        nvlist_t        *opts;
        size_t          packed_len;
        int             ioc;
        int             err;

        if (!devzvol_snaps_allowed && strchr(dsname, '@') != NULL)
                return (ENOTSUP);

        cmd = kmem_zalloc(sizeof (*cmd), KM_SLEEP);
        (void) strlcpy(cmd->zc_name, dsname, MAXPATHLEN);

        /* Pack the request options and hand the packed form to the ioctl. */
        opts = fnvlist_alloc();
        fnvlist_add_boolean_value(opts, "cachedpropsonly", B_TRUE);
        cmd->zc_nvlist_src = (uintptr_t)fnvlist_pack(opts, &packed_len);
        cmd->zc_nvlist_src_size = packed_len;
        fnvlist_free(opts);

        if (pool_level)
                ioc = ZFS_IOC_POOL_STATS;
        else
                ioc = ZFS_IOC_OBJSET_STATS;

        err = devzvol_handle_ioctl(ioc, cmd, NULL);
        if (err == 0 && type != NULL) {
                if (pool_level)
                        *type = DMU_OST_ZFS;
                else
                        *type = cmd->zc_objset_stats.dds_type;
        }

        fnvlist_pack_free((char *)(uintptr_t)cmd->zc_nvlist_src, packed_len);
        kmem_free(cmd, sizeof (*cmd));
        return (err);
}

/*
 * Build the zfs dataset name that corresponds to a /dev/zvol directory
 * `path' plus an optional final component `name' (may be NULL).
 *
 * The leading ZVOL_DIR and the following "/dsk" or "/rdsk" are stripped
 * from `path'; what remains, if anything, is joined to `name' with a '/'.
 *
 * Returns NULL when no dataset corresponds to the arguments:
 *   - `path' is ZVOL_DIR itself;
 *   - `name' is "." or "..";
 *   - `path' continues with neither "/dsk" nor "/rdsk";
 *   - `name' is NULL and `path' is one of the top-level {dsk,rdsk}
 *     directories, which do not name a specific dataset.
 *
 * Otherwise returns a kmem-allocated string whose allocation is exactly
 * strlen(result) + 1 bytes; the caller frees it with that size.
 */
char *
devzvol_make_dsname(const char *path, const char *name)
{
        const char *rest;
        char *buf;
        int need = 0;

        if (strcmp(path, ZVOL_DIR) == 0)
                return (NULL);
        if (name != NULL &&
            (strcmp(name, ".") == 0 || strcmp(name, "..") == 0))
                return (NULL);

        /* Step over ZVOL_DIR and the dsk/rdsk component. */
        rest = path + strlen(ZVOL_DIR);
        if (strncmp(rest, "/dsk", 4) == 0) {
                rest += strlen("/dsk");
        } else if (strncmp(rest, "/rdsk", 5) == 0) {
                rest += strlen("/rdsk");
        } else {
                return (NULL);
        }

        if (*rest == '/') {
                rest++;
        } else if (name == NULL) {
                return (NULL);
        }

        /* Size: "<rest>\0", plus "/<name>" (or "<name>\0" with no rest). */
        if (*rest != '\0')
                need = strlen(rest) + 1;
        if (name != NULL)
                need += strlen(name) + 1;

        buf = kmem_zalloc(need, KM_SLEEP);
        if (*rest != '\0') {
                (void) strlcpy(buf, rest, need);
                if (name != NULL)
                        (void) strlcat(buf, "/", need);
        }
        if (name != NULL)
                (void) strlcat(buf, name, need);
        return (buf);
}

/*
 * check if the zvol's sdev_node is still valid, which means make
 * sure the zvol is still valid.  zvol minors aren't proactively
 * destroyed when the zvol is destroyed, so we use a validator to clean
 * these up (in other words, when such nodes are encountered during
 * subsequent lookup() and readdir() operations) so that only valid
 * nodes are returned.  The ordering between devname_lookup_func and
 * devzvol_validate is a little inefficient in the case of invalid
 * or stale nodes because devname_lookup_func calls
 * devzvol_create_{dir, link}, then the validator says it's invalid,
 * and then the node gets cleaned up.
 */
int
devzvol_validate(struct sdev_node *dv)
{
        vnode_t *vn = SDEVTOV(dv);
        dmu_objset_type_t do_type;
        char *dsname;
        char *nm = dv->sdev_name;
        int rc;

        sdcmn_err13(("validating ('%s' '%s')", dv->sdev_path, nm));
        /*
         * validate only READY nodes; if someone is sitting on the
         * directory of a dataset that just got destroyed we could
         * get a zombie node which we just skip.
         */
        if (dv->sdev_state != SDEV_READY) {
                sdcmn_err13(("skipping '%s'", nm));
                return (SDEV_VTOR_SKIP);
        }

        if ((strcmp(dv->sdev_path, ZVOL_DIR "/dsk") == 0) ||
            (strcmp(dv->sdev_path, ZVOL_DIR "/rdsk") == 0))
                return (SDEV_VTOR_VALID);
        dsname = devzvol_make_dsname(dv->sdev_path, NULL);
        if (dsname == NULL)
                return (SDEV_VTOR_INVALID);

        /*
         * Leave any nodes alone that have been explicitly created by
         * sdev profiles.
         */
        if (!(dv->sdev_flags & SDEV_GLOBAL) && dv->sdev_origin != NULL) {
                kmem_free(dsname, strlen(dsname) + 1);
                return (SDEV_VTOR_VALID);
        }

        rc = devzvol_objset_check(dsname, &do_type);
        sdcmn_err13(("  '%s' rc %d", dsname, rc));
        if (rc != 0) {
                sdev_node_t *parent = dv->sdev_dotdot;
                /*
                 * Explicitly passed-through zvols in our sdev profile can't
                 * be created as prof_* shadow nodes, because in the GZ they
                 * are symlinks, but in the NGZ they are actual device files.
                 *
                 * The objset_check will fail on these as they are outside
                 * any delegated dataset (zfs will not allow ioctl access to
                 * them from this zone). We still want them to work, though.
                 */
                if (!(parent->sdev_flags & SDEV_GLOBAL) &&
                    parent->sdev_origin != NULL &&
                    !(dv->sdev_flags & SDEV_GLOBAL) &&
                    (vn->v_type == VBLK || vn->v_type == VCHR) &&
                    prof_name_matched(nm, parent)) {
                        do_type = DMU_OST_ZVOL;
                } else {
                        kmem_free(dsname, strlen(dsname) + 1);
                        return (SDEV_VTOR_INVALID);
                }
        }

        sdcmn_err13(("  v_type %d do_type %d",
            vn->v_type, do_type));
        if ((vn->v_type == VLNK && do_type != DMU_OST_ZVOL) ||
            ((vn->v_type == VBLK || vn->v_type == VCHR) &&
            do_type != DMU_OST_ZVOL) ||
            (vn->v_type == VDIR && do_type == DMU_OST_ZVOL)) {
                kmem_free(dsname, strlen(dsname) + 1);
                return (SDEV_VTOR_STALE);
        }
        if (vn->v_type == VLNK) {
                char *ptr, *link;
                long val = 0;
                minor_t lminor, ominor;

                rc = sdev_getlink(vn, &link);
                ASSERT(rc == 0);

                ptr = strrchr(link, ':') + 1;
                rc = ddi_strtol(ptr, NULL, 10, &val);
                kmem_free(link, strlen(link) + 1);
                ASSERT(rc == 0 && val != 0);
                lminor = (minor_t)val;
                if (sdev_zvol_name2minor(dsname, &ominor) < 0 ||
                    ominor != lminor) {
                        kmem_free(dsname, strlen(dsname) + 1);
                        return (SDEV_VTOR_STALE);
                }
        }
        kmem_free(dsname, strlen(dsname) + 1);
        return (SDEV_VTOR_VALID);
}

/*
 * Taskq callback to update the devzvol_zclist.
 *
 * We need to defer this to the taskq to avoid it running with a user
 * context that might be associated with some non-global zone, and thus
 * not being able to list all of the pools on the entire system.
 *
 * The callback sends ZFS_IOC_POOL_CONFIGS with the current devzvol_gen as
 * cookie while holding devzvol_mtx.  On success the returned buffer
 * replaces the cached list and the returned cookie becomes the new
 * generation.  On any failure the cache is left untouched and whatever
 * buffer the ioctl helper handed back is freed.  In every case
 * devzvol_zclist_task_running is cleared before the mutex is dropped.
 */
/*ARGSUSED*/
static void
devzvol_update_zclist_cb(void *arg)
{
        zfs_cmd_t       *zc;
        int             rc;
        /*
         * Initialized because devzvol_handle_ioctl() can fail before it
         * has allocated a buffer or stored a size.
         */
        size_t          size = 0;

        zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
        mutex_enter(&devzvol_mtx);
        zc->zc_cookie = devzvol_gen;

        rc = devzvol_handle_ioctl(ZFS_IOC_POOL_CONFIGS, zc, &size);
        if (rc == 0) {
                /* new generation */
                ASSERT(devzvol_gen != zc->zc_cookie);
                devzvol_gen = zc->zc_cookie;
                if (devzvol_zclist)
                        kmem_free((void *)(uintptr_t)devzvol_zclist,
                            devzvol_zclist_size);
                devzvol_zclist = zc->zc_nvlist_dst;
                /* Keep the alloc'd size, not the nvlist size. */
                devzvol_zclist_size = size;
        } else if (zc->zc_nvlist_dst != 0) {
                /*
                 * Either there was no change in pool configuration
                 * since we last asked (rc == EEXIST) or we got a
                 * catastrophic error.
                 *
                 * Give up memory and exit.  zc was zero-filled, so a
                 * zero zc_nvlist_dst means no buffer was ever allocated
                 * and there is nothing to free.
                 */
                kmem_free((void *)(uintptr_t)zc->zc_nvlist_dst, size);
        }

        VERIFY(devzvol_zclist_task_running == B_TRUE);
        devzvol_zclist_task_running = B_FALSE;
        mutex_exit(&devzvol_mtx);

        kmem_free(zc, sizeof (zfs_cmd_t));
}

/*
 * Refresh the cached pool list.  Unless a refresh is already marked as
 * running, mark one as running and dispatch devzvol_update_zclist_cb() on
 * sdev_taskq; then, in either case, wait for sdev_taskq to drain.
 */
static void
devzvol_update_zclist(void)
{
        mutex_enter(&devzvol_mtx);
        if (devzvol_zclist_task_running != B_TRUE) {
                devzvol_zclist_task_running = B_TRUE;
                taskq_dispatch_ent(sdev_taskq, devzvol_update_zclist_cb,
                    NULL, 0, &devzvol_zclist_task);
        }
        mutex_exit(&devzvol_mtx);

        /* Whoever dispatched the refresh, wait for it to finish. */
        taskq_wait(sdev_taskq);
}

/*
 * Creates sub-directories for each zpool as needed in response to a
 * readdir on one of the /dev/zvol/{dsk,rdsk} directories.
 *
 * The cached pool list is refreshed first, then unpacked under
 * devzvol_mtx.  Each pool name is looked up in `dvp' with the mutex
 * dropped; the lookup is what creates the directory node.  If no pools
 * were found and /dev/zfs is open, it is closed again so that zfs can be
 * unloaded.
 *
 * If there is no cached list at all (for instance because /dev/zfs could
 * not be opened) the function returns without doing anything.  If a cached
 * list exists but cannot be unpacked, the cache and generation are reset.
 */
void
devzvol_create_pool_dirs(struct vnode *dvp)
{
        nvlist_t *nv = NULL;
        nvpair_t *elem = NULL;
        int pools = 0;
        int rc;

        sdcmn_err13(("devzvol_create_pool_dirs"));

        devzvol_update_zclist();

        mutex_enter(&devzvol_mtx);

        /* Nothing cached means nothing to unpack and no dirs to create. */
        if (devzvol_zclist == 0)
                goto out;

        rc = nvlist_unpack((char *)(uintptr_t)devzvol_zclist,
            devzvol_zclist_size, &nv, 0);
        if (rc) {
                /* A cached list that does not unpack should never happen. */
                ASSERT(rc == 0);
                kmem_free((void *)(uintptr_t)devzvol_zclist,
                    devzvol_zclist_size);
                devzvol_gen = 0;
                devzvol_zclist = 0;
                devzvol_zclist_size = 0;
                goto out;
        }
        mutex_exit(&devzvol_mtx);
        while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
                struct vnode *vp;
                ASSERT(dvp->v_count > 0);
                rc = VOP_LOOKUP(dvp, nvpair_name(elem), &vp, NULL, 0,
                    NULL, kcred, NULL, 0, NULL);
                /* should either work, or not be visible from a zone */
                ASSERT(rc == 0 || rc == ENOENT);
                if (rc == 0)
                        VN_RELE(vp);
                pools++;
        }
        nvlist_free(nv);
        mutex_enter(&devzvol_mtx);
        if (devzvol_isopen && pools == 0) {
                /* clean up so zfs can be unloaded */
                devzvol_close_zfs();
                devzvol_isopen = B_FALSE;
        }
out:
        mutex_exit(&devzvol_mtx);
}

/*
 * devname_lookup_func() callback for directory nodes.  `arg' is used as a
 * pointer to a struct vattr, which is filled with the default directory
 * attributes and has all three timestamps set to the current time.
 * Always returns 0.
 */
/*ARGSUSED3*/
static int
devzvol_create_dir(struct sdev_node *ddv, char *nm, void **arg,
    cred_t *cred, void *whatever, char *whichever)
{
        struct vattr *attrs = (struct vattr *)arg;
        timestruc_t stamp;

        sdcmn_err13(("create_dir (%s) (%s) '%s'", ddv->sdev_name,
            ddv->sdev_path, nm));
        ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR,
            strlen(ZVOL_DIR)) == 0);

        /* Start from the stock VDIR attributes, then stamp the times. */
        *attrs = *sdev_getdefault_attr(VDIR);
        gethrestime(&stamp);
        attrs->va_atime = attrs->va_mtime = attrs->va_ctime = stamp;

        return (0);
}

/*
 * devname_lookup_func() callback for zvol symlinks.  Makes sure the minor
 * for the zvol named by ddv's path plus `nm' exists, then writes the
 * symlink target into the buffer `*arg' points at: one "../" for every
 * '/' in ddv->sdev_path, followed by ZVOL_PSEUDO_DEV and the minor number,
 * with ",raw" appended for nodes below ZVOL_FULL_RDEV_DIR.
 *
 * Returns 0 on success.  Returns -1 if no dataset name can be derived, if
 * the minor cannot be created or looked up (EEXIST and EBUSY from the
 * create are tolerated), or if the target would not fit in the buffer.
 *
 * NOTE(review): the buffer behind *arg is assumed to be MAXPATHLEN bytes
 * long -- confirm against devname_lookup_func().
 */
/*ARGSUSED3*/
static int
devzvol_create_link(struct sdev_node *ddv, char *nm,
    void **arg, cred_t *cred, void *whatever, char *whichever)
{
        minor_t minor;
        char *pathname = (char *)*arg;
        int rc;
        char *dsname;
        char *x;
        char str[MAXNAMELEN];

        sdcmn_err13(("create_link (%s) (%s) '%s'", ddv->sdev_name,
            ddv->sdev_path, nm));
        dsname = devzvol_make_dsname(ddv->sdev_path, nm);
        if (dsname == NULL)
                return (-1);
        rc = sdev_zvol_create_minor(dsname);
        if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
            sdev_zvol_name2minor(dsname, &minor)) {
                sdcmn_err13(("devzvol_create_link %d", rc));
                kmem_free(dsname, strlen(dsname) + 1);
                return (-1);
        }
        kmem_free(dsname, strlen(dsname) + 1);

        /*
         * This is a valid zvol; create a symlink that points to the
         * minor which was created under /devices/pseudo/zfs@0
         */
        *pathname = '\0';
        /* One "../" per path separator climbs back up to the root. */
        for (x = strchr(ddv->sdev_path, '/'); x != NULL;
            x = strchr(x + 1, '/')) {
                if (strlcat(pathname, "../", MAXPATHLEN) >= MAXPATHLEN)
                        return (-1);
        }
        (void) snprintf(str, sizeof (str), ZVOL_PSEUDO_DEV "%u", minor);
        if (strlcat(pathname, str, MAXPATHLEN) >= MAXPATHLEN)
                return (-1);
        if (strncmp(ddv->sdev_path, ZVOL_FULL_RDEV_DIR,
            strlen(ZVOL_FULL_RDEV_DIR)) == 0) {
                if (strlcat(pathname, ",raw", MAXPATHLEN) >= MAXPATHLEN)
                        return (-1);
        }
        return (0);
}

/*
 * Clean zvol sdev_nodes that are no longer valid.
 *
 * Walks the entries of directory `ddv', runs devzvol_validate() on each
 * and removes from the cache every entry that is neither valid nor
 * skipped.
 *
 * Locking: the caller must hold ddv->sdev_contents as reader.  The lock is
 * held as writer for the walk and downgraded to reader again before
 * returning.  When the in-place upgrade fails the lock is dropped and
 * re-acquired, so the directory may change in that window.
 */
static void
devzvol_prunedir(struct sdev_node *ddv)
{
        struct sdev_node *dv;

        ASSERT(RW_READ_HELD(&ddv->sdev_contents));

        sdcmn_err13(("prunedir '%s'", ddv->sdev_name));
        ASSERT(strncmp(ddv->sdev_path, ZVOL_DIR, strlen(ZVOL_DIR)) == 0);
        /* Try to upgrade in place; otherwise drop and retake as writer. */
        if (rw_tryupgrade(&ddv->sdev_contents) == 0) {
                rw_exit(&ddv->sdev_contents);
                rw_enter(&ddv->sdev_contents, RW_WRITER);
        }

        dv = SDEV_FIRST_ENTRY(ddv);
        while (dv) {
                sdcmn_err13(("sdev_name '%s'", dv->sdev_name));

                /*
                 * Only VALID and SKIP keep the node.  INVALID logs a
                 * message first; any other result (e.g. STALE) has no
                 * case and goes straight to the removal code below.
                 */
                switch (devzvol_validate(dv)) {
                case SDEV_VTOR_VALID:
                case SDEV_VTOR_SKIP:
                        dv = SDEV_NEXT_ENTRY(ddv, dv);
                        continue;
                case SDEV_VTOR_INVALID:
                        sdcmn_err7(("prunedir: destroy invalid "
                            "node: %s\n", dv->sdev_name));
                        break;
                }

                /* A directory that cannot be emptied is left in place. */
                if ((SDEVTOV(dv)->v_type == VDIR) &&
                    (sdev_cleandir(dv, NULL, 0) != 0)) {
                        dv = SDEV_NEXT_ENTRY(ddv, dv);
                        continue;
                }
                SDEV_HOLD(dv);
                /* remove the cache node */
                sdev_cache_update(ddv, &dv, dv->sdev_name,
                    SDEV_CACHE_DELETE);
                SDEV_RELE(dv);
                /* The entry list changed, so restart the scan from the top. */
                dv = SDEV_FIRST_ENTRY(ddv);
        }
        rw_downgrade(&ddv->sdev_contents);
}

/*
 * This function is used to create a dir or dev inside a zone's /dev when the
 * zone has a zvol that is dynamically created within the zone (i.e. inside
 * of a delegated dataset.  Since there is no /devices tree within a zone,
 * we create the chr/blk devices directly inside the zone's /dev instead of
 * making symlinks.
 */
static int
devzvol_mk_ngz_node(struct sdev_node *parent, char *nm)
{
        struct vattr vattr;
        timestruc_t now;
        enum vtype expected_type = VDIR;
        dmu_objset_type_t do_type;
        struct sdev_node *dv = NULL;
        int res;
        char *dsname;

        bzero(&vattr, sizeof (vattr));
        gethrestime(&now);
        vattr.va_mask = AT_TYPE|AT_MODE|AT_UID|AT_GID;
        vattr.va_uid = SDEV_UID_DEFAULT;
        vattr.va_gid = SDEV_GID_DEFAULT;
        vattr.va_type = VNON;
        vattr.va_atime = now;
        vattr.va_mtime = now;
        vattr.va_ctime = now;

        if ((dsname = devzvol_make_dsname(parent->sdev_path, nm)) == NULL)
                return (ENOENT);

        if (devzvol_objset_check(dsname, &do_type) != 0) {
                /*
                 * objset_check will succeed on any valid objset in the global
                 * zone, and any valid delegated dataset. It will fail, however,
                 * in non-global zones on explicitly whitelisted zvol devices
                 * that are outside any delegated dataset.
                 *
                 * The directories leading up to the zvol device itself will be
                 * created by prof for us in advance (and will always validate
                 * because of the matching check in devzvol_validate). The zvol
                 * device itself can't be created by prof though because in the
                 * GZ it's a symlink, and in the NGZ it is not. So, we create
                 * such zvol device files here.
                 */
                if (!(parent->sdev_flags & SDEV_GLOBAL) &&
                    parent->sdev_origin != NULL &&
                    prof_name_matched(nm, parent)) {
                        do_type = DMU_OST_ZVOL;
                } else {
                        kmem_free(dsname, strlen(dsname) + 1);
                        return (ENOENT);
                }
        }

        if (do_type == DMU_OST_ZVOL)
                expected_type = VBLK;

        if (expected_type == VDIR) {
                vattr.va_type = VDIR;
                vattr.va_mode = SDEV_DIRMODE_DEFAULT;
        } else {
                minor_t minor;
                dev_t devnum;
                int rc;

                rc = sdev_zvol_create_minor(dsname);
                if ((rc != 0 && rc != EEXIST && rc != EBUSY) ||
                    sdev_zvol_name2minor(dsname, &minor)) {
                        kmem_free(dsname, strlen(dsname) + 1);
                        return (ENOENT);
                }

                devnum = makedevice(devzvol_major, minor);
                vattr.va_rdev = devnum;

                if (strstr(parent->sdev_path, "/rdsk/") != NULL)
                        vattr.va_type = VCHR;
                else
                        vattr.va_type = VBLK;
                vattr.va_mode = SDEV_DEVMODE_DEFAULT;
        }
        kmem_free(dsname, strlen(dsname) + 1);

        rw_enter(&parent->sdev_contents, RW_WRITER);

        res = sdev_mknode(parent, nm, &dv, &vattr,
            NULL, NULL, kcred, SDEV_READY);
        rw_exit(&parent->sdev_contents);
        if (res != 0)
                return (ENOENT);

        SDEV_RELE(dv);
        return (0);
}

/*
 * VOP_LOOKUP for /dev/zvol directories: find `nm' in directory `dvp',
 * creating its sdev node on demand, and return it through `vpp'.
 *
 * For a directory that is not global (SDEV_IS_GLOBAL() is false) the
 * lookup goes through prof_lookup(); if that reports ENOENT, the node is
 * created with devzvol_mk_ngz_node() and the lookup is repeated.  A caller
 * running in the global zone gets EPERM for such a directory.
 *
 * For a global directory the caller must be in the global zone (EPERM
 * otherwise).  The dataset name is derived from the directory path and
 * `nm'; a zvol is looked up as a symlink via devzvol_create_link, anything
 * else (including names with no dataset equivalent) as a directory via
 * devzvol_create_dir.
 *
 * Returns 0 with *vpp set on success.  On the paths visible here errors
 * are the VOP_ACCESS() failure, EPERM, ENOENT, or whatever prof_lookup()
 * or devname_lookup_func() return.
 */
/*ARGSUSED*/
static int
devzvol_lookup(struct vnode *dvp, char *nm, struct vnode **vpp,
    struct pathname *pnp, int flags, struct vnode *rdir, struct cred *cred,
    caller_context_t *ct, int *direntflags, pathname_t *realpnp)
{
        enum vtype expected_type = VDIR;
        struct sdev_node *parent = VTOSDEV(dvp);
        char *dsname;
        dmu_objset_type_t do_type;
        int error;

        sdcmn_err13(("devzvol_lookup '%s' '%s'", parent->sdev_path, nm));
        *vpp = NULL;
        /* execute access is required to search the directory */
        if ((error = VOP_ACCESS(dvp, VEXEC, 0, cred, ct)) != 0)
                return (error);

        rw_enter(&parent->sdev_contents, RW_READER);
        if (!SDEV_IS_GLOBAL(parent)) {
                int res;

                rw_exit(&parent->sdev_contents);

                /*
                 * If we're in the global zone and reach down into a non-global
                 * zone's /dev/zvol then this action could trigger the creation
                 * of all of the zvol devices for every zone into the non-global
                 * zone's /dev tree. This could be a big security hole. To
                 * prevent this, disallow the global zone from looking inside
                 * a non-global zone's /dev/zvol. This behavior is similar to
                 * delegated datasets, which cannot be used by the global zone.
                 */
                if (getzoneid() == GLOBAL_ZONEID)
                        return (EPERM);

                res = prof_lookup(dvp, nm, vpp, cred);

                /*
                 * We won't find a zvol that was dynamically created inside
                 * a NGZ, within a delegated dataset, in the zone's dev profile
                 * but prof_lookup will also find it via sdev_cache_lookup.
                 */
                if (res == ENOENT) {
                        /*
                         * We have to create the sdev node for the dynamically
                         * created zvol.
                         */
                        if (devzvol_mk_ngz_node(parent, nm) != 0)
                                return (ENOENT);
                        res = prof_lookup(dvp, nm, vpp, cred);
                }

                return (res);
        }

        /*
         * Don't let the global-zone style lookup succeed here when we're not
         * running in the global zone. This can happen because prof calls into
         * us (in prof_filldir) trying to create an explicitly passed-through
         * zvol device outside any delegated dataset.
         *
         * We have to stop this here or else we will create prof shadows of
         * the global zone symlink, which will make no sense at all in the
         * non-global zone (it has no /devices for the symlink to point at).
         *
         * These zvols will be created later (at access time) by mk_ngz_node
         * instead. The dirs leading up to them will be created by prof
         * internally.
         *
         * We have to return EPERM here, because ENOENT is given special
         * meaning by prof in this context.
         */
        if (getzoneid() != GLOBAL_ZONEID) {
                rw_exit(&parent->sdev_contents);
                return (EPERM);
        }

        dsname = devzvol_make_dsname(parent->sdev_path, nm);
        rw_exit(&parent->sdev_contents);
        sdcmn_err13(("rvp dsname %s", dsname ? dsname : "(null)"));
        /* A NULL dsname (e.g. "dsk"/"rdsk" under /dev/zvol) stays a VDIR. */
        if (dsname) {
                error = devzvol_objset_check(dsname, &do_type);
                if (error != 0) {
                        /* Any objset check failure is reported as ENOENT. */
                        error = ENOENT;
                        goto out;
                }
                if (do_type == DMU_OST_ZVOL)
                        expected_type = VLNK;
        }
        /*
         * the callbacks expect:
         *
         * parent->sdev_path               nm
         * /dev/zvol                       {r}dsk
         * /dev/zvol/{r}dsk                <pool name>
         * /dev/zvol/{r}dsk/<dataset name> <last ds component>
         *
         * sdev_name is always last path component of sdev_path
         */
        if (expected_type == VDIR) {
                error = devname_lookup_func(parent, nm, vpp, cred,
                    devzvol_create_dir, SDEV_VATTR);
        } else {
                error = devname_lookup_func(parent, nm, vpp, cred,
                    devzvol_create_link, SDEV_VLINK);
        }
        sdcmn_err13(("devzvol_lookup %d %d", expected_type, error));
        ASSERT(error || ((*vpp)->v_type == expected_type));
out:
        if (dsname)
                kmem_free(dsname, strlen(dsname) + 1);
        sdcmn_err13(("devzvol_lookup %d", error));
        return (error);
}

/*
 * VOP_CREATE for /dev/zvol.  Nothing is ever created here; the call only
 * succeeds in finding a node that devzvol_lookup() can produce:
 *      - the node doesn't exist                       - EROFS
 *      - exclusive create of an existing node         - EEXIST
 *      - existing directory opened with VWRITE        - EISDIR
 *      - otherwise the result of VOP_ACCESS() on the node
 * Any lookup error other than ENOENT is returned unchanged.  On success
 * the held vnode is returned through `vpp'.
 */
/*ARGSUSED2*/
static int
devzvol_create(struct vnode *dvp, char *nm, struct vattr *vap, vcexcl_t excl,
    int mode, struct vnode **vpp, struct cred *cred, int flag,
    caller_context_t *ct, vsecattr_t *vsecp)
{
        struct vnode *found;
        int err;

        *vpp = NULL;

        err = devzvol_lookup(dvp, nm, &found, NULL, 0, NULL, cred, ct, NULL,
            NULL);
        if (err == ENOENT)
                return (EROFS);
        if (err != 0)
                return (err);

        if (excl == EXCL) {
                err = EEXIST;
        } else if (found->v_type == VDIR && (mode & VWRITE)) {
                err = EISDIR;
        } else {
                err = VOP_ACCESS(found, mode, 0, cred, ct);
        }

        if (err != 0) {
                /* Drop the hold taken by the lookup. */
                VN_RELE(found);
                return (err);
        }

        *vpp = found;
        return (0);
}

void sdev_iter_snapshots(struct vnode *dvp, char *name);

void
sdev_iter_datasets(struct vnode *dvp, int arg, char *name)
{
        zfs_cmd_t       *zc;
        int rc;

        sdcmn_err13(("iter name is '%s' (arg %x)", name, arg));
        zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP);
        (void) strcpy(zc->zc_name, name);

        while ((rc = devzvol_handle_ioctl(arg, zc, B_FALSE)) == 0) {
                struct vnode *vpp;
                char *ptr;

                sdcmn_err13(("  name %s", zc->zc_name));
                if (strchr(zc->zc_name, '$') || strchr(zc->zc_name, '%'))
                        goto skip;
                ptr = strrchr(zc->zc_name, '/') + 1;
                rc = devzvol_lookup(dvp, ptr, &vpp, NULL, 0, NULL,
                    kcred, NULL, NULL, NULL);
                if (rc == 0) {
                        VN_RELE(vpp);
                } else if (rc == ENOENT) {
                        goto skip;
                } else {
                        /*
                         * EBUSY == problem with zvols's dmu holds?
                         * EPERM when in a NGZ and traversing up and out.
                         */
                        goto skip;
                }
                if (arg == ZFS_IOC_DATASET_LIST_NEXT &&
                    zc->zc_objset_stats.dds_type == DMU_OST_ZVOL &&
                    devzvol_snaps_allowed)
                        sdev_iter_snapshots(dvp, zc->zc_name);
skip:
                (void) strcpy(zc->zc_name, name);
        }
        kmem_free(zc, sizeof (zfs_cmd_t));
}

/*
 * Walk the snapshots of dataset `name', looking each one up in `dvp'.
 * This is sdev_iter_datasets() driven by ZFS_IOC_SNAPSHOT_LIST_NEXT.
 */
void
sdev_iter_snapshots(struct vnode *dvp, char *name)
{
        sdev_iter_datasets(dvp, ZFS_IOC_SNAPSHOT_LIST_NEXT, name);
}

/*
 * VOP_READDIR for /dev/zvol directories.  Before handing off to
 * devname_readdir_func() the directory is populated according to its
 * position in the tree:
 *
 *   /dev/zvol               the "dsk" and "rdsk" directories are looked up
 *                           (and thereby created);
 *   /dev/zvol/{dsk,rdsk}    a directory is created for every pool;
 *   anything deeper         the child datasets of the directory's dataset
 *                           are looked up.
 *
 * Except for /dev/zvol itself, a read starting at offset 0 first prunes
 * nodes that are no longer valid.
 *
 * Locking: sdvp->sdev_contents is held as reader on entry (the pruning
 * code asserts this) and on return; it is dropped while the directory is
 * being populated.
 *
 * Returns ENOENT for a path below /dev/zvol that has no component after
 * its first one; otherwise the result of devname_readdir_func().
 */
/*ARGSUSED4*/
static int
devzvol_readdir(struct vnode *dvp, struct uio *uiop, struct cred *cred,
    int *eofp, caller_context_t *ct_unused, int flags_unused)
{
        struct sdev_node *sdvp = VTOSDEV(dvp);
        char *ptr;

        sdcmn_err13(("zv readdir of '%s' %s'", sdvp->sdev_path,
            sdvp->sdev_name));

        if (strcmp(sdvp->sdev_path, ZVOL_DIR) == 0) {
                struct vnode *vp;

                rw_exit(&sdvp->sdev_contents);
                /*
                 * The lookups are best effort, but a vnode is only
                 * released when the lookup actually returned one.
                 */
                if (devname_lookup_func(sdvp, "dsk", &vp, cred,
                    devzvol_create_dir, SDEV_VATTR) == 0)
                        VN_RELE(vp);
                if (devname_lookup_func(sdvp, "rdsk", &vp, cred,
                    devzvol_create_dir, SDEV_VATTR) == 0)
                        VN_RELE(vp);
                rw_enter(&sdvp->sdev_contents, RW_READER);
                return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
        }
        if (uiop->uio_offset == 0)
                devzvol_prunedir(sdvp);
        ptr = sdvp->sdev_path + strlen(ZVOL_DIR);
        if ((strcmp(ptr, "/dsk") == 0) || (strcmp(ptr, "/rdsk") == 0)) {
                rw_exit(&sdvp->sdev_contents);
                devzvol_create_pool_dirs(dvp);
                rw_enter(&sdvp->sdev_contents, RW_READER);
                return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
        }

        /* Skip the {dsk,rdsk} component; the rest is the dataset name. */
        ptr = strchr(ptr + 1, '/');
        if (ptr == NULL)
                return (ENOENT);
        ptr++;
        rw_exit(&sdvp->sdev_contents);
        sdev_iter_datasets(dvp, ZFS_IOC_DATASET_LIST_NEXT, ptr);
        rw_enter(&sdvp->sdev_contents, RW_READER);
        return (devname_readdir_func(dvp, uiop, cred, eofp, 0));
}

/*
 * Vnode operation table for /dev/zvol directories.  readdir, lookup and
 * create are implemented in this file; rename, mkdir, rmdir, remove and
 * symlink are all routed to fs_nosys.
 */
const fs_operation_def_t devzvol_vnodeops_tbl[] = {
        VOPNAME_READDIR,        { .vop_readdir = devzvol_readdir },
        VOPNAME_LOOKUP,         { .vop_lookup = devzvol_lookup },
        VOPNAME_CREATE,         { .vop_create = devzvol_create },
        VOPNAME_RENAME,         { .error = fs_nosys },
        VOPNAME_MKDIR,          { .error = fs_nosys },
        VOPNAME_RMDIR,          { .error = fs_nosys },
        VOPNAME_REMOVE,         { .error = fs_nosys },
        VOPNAME_SYMLINK,        { .error = fs_nosys },
        NULL,                   NULL
};