root/usr/src/uts/common/fs/tmpfs/tmp_dir.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/cred.h>
#include <sys/stat.h>
#include <sys/debug.h>
#include <sys/policy.h>
#include <sys/fs/tmpnode.h>
#include <sys/fs/tmp.h>
#include <sys/vtrace.h>

/*
 * Forward declarations for the file-local (static) helpers that the
 * directory entry points in this file call before the helpers' own
 * definitions appear.
 */
static int tdircheckpath(struct tmpnode *, struct tmpnode *, struct cred *);
static int tdirrename(struct tmpnode *, struct tmpnode *, struct tmpnode *,
        char *, struct tmpnode *, struct tdirent *, struct cred *);
static void tdirfixdotdot(struct tmpnode *, struct tmpnode *, struct tmpnode *);
static int tdirmaketnode(struct tmpnode *, struct tmount *, struct vattr *,
        enum de_op, struct tmpnode **, struct cred *);
static int tdiraddentry(struct tmpnode *, struct tmpnode *, char *,
        enum de_op, struct tmpnode *);


/*
 * Geometry of the global directory-entry name hash.  Both sizes must be
 * powers of two because T_HASH_INDEX() and T_MUTEX_INDEX() reduce a hash
 * value by masking it with (size - 1).  Since T_MUTEX_SIZE divides
 * T_HASH_SIZE, every entry on a given bucket chain maps to the same mutex.
 */
#define T_HASH_SIZE     8192            /* must be power of 2 */
#define T_MUTEX_SIZE    64

/*
 * Tunables and a diagnostic counter for the rw_tryenter() backoff loop
 * in tdirenter():
 *
 *   tmpfs_rename_backoff_delay  value handed to delay() between attempts.
 *   tmpfs_rename_backoff_tries  when nonzero, bounds the number of retries
 *                               before tdirenter() gives up with EBUSY;
 *                               zero means keep retrying.
 *   tmpfs_rename_loops          bumped (non-atomically) on every failed
 *                               rw_tryenter() attempt.
 *
 * Non-static so compilers won't constant-fold these away.
 */
clock_t tmpfs_rename_backoff_delay = 1;
unsigned int tmpfs_rename_backoff_tries = 0;
unsigned long tmpfs_rename_loops = 0;

/*
 * The hash table proper (an array of chain heads linked through td_link)
 * and the array of mutexes protecting those chains.
 */
static struct tdirent   *t_hashtable[T_HASH_SIZE];
static kmutex_t          t_hashmutex[T_MUTEX_SIZE];

/* Map a hash value to its bucket index and to its mutex index. */
#define T_HASH_INDEX(a)         ((a) & (T_HASH_SIZE-1))
#define T_MUTEX_INDEX(a)        ((a) & (T_MUTEX_SIZE-1))

/*
 * Compute the hash of the directory entry 'name' in parent tmpnode 'tp'
 * and store the result in the lvalue 'hash' (a uint_t).
 *
 * The hash is seeded with the parent pointer shifted right by 8 bits and
 * then, for every character of the NUL-terminated name, multiplied by 17
 * ((hash << 4) + hash) with the character added in.
 *
 * 'name' must be a char *; each character is converted with a plain
 * (uint_t) cast of a char, so the contribution of bytes above 0x7f depends
 * on the platform's char signedness.
 *
 * The body is wrapped in do { } while (0) so that "TMPFS_HASH(...);" is a
 * single statement and can be used safely in an unbraced if/else.
 */
#define TMPFS_HASH(tp, name, hash)                              \
        do {                                                    \
                char Xc, *Xcp;                                  \
                (hash) = (uint_t)(uintptr_t)(tp) >> 8;          \
                for (Xcp = (name); (Xc = *Xcp) != 0; Xcp++)     \
                        (hash) = ((hash) << 4) + (hash) +       \
                            (uint_t)Xc;                         \
        } while (0)

void
tmpfs_hash_init(void)
{
        int     ix;

        for (ix = 0; ix < T_MUTEX_SIZE; ix++)
                mutex_init(&t_hashmutex[ix], NULL, MUTEX_DEFAULT, NULL);
}

/*
 * Insert tdirent *t into the global name hash.
 *
 * The hash is derived from t->td_parent and t->td_name, which must
 * already be filled in.  It is cached in t->td_hash, and the entry is
 * then pushed onto the front of its bucket chain while the mutex covering
 * that bucket is held.
 */
static void
tmpfs_hash_in(struct tdirent *t)
{
        uint_t          hval;
        struct tdirent  **bucket;
        kmutex_t        *lock;

        TMPFS_HASH(t->td_parent, t->td_name, hval);
        t->td_hash = hval;

        bucket = &t_hashtable[T_HASH_INDEX(hval)];
        lock = &t_hashmutex[T_MUTEX_INDEX(hval)];

        mutex_enter(lock);
        t->td_link = *bucket;
        *bucket = t;
        mutex_exit(lock);
}

/*
 * Unlink tdirent *t from its hash chain.
 *
 * The bucket is located through the hash cached in t->td_hash.  The entry
 * must be on that chain: the walk has no end-of-list check.
 */
static void
tmpfs_hash_out(struct tdirent *t)
{
        uint_t          hval = t->td_hash;
        kmutex_t        *lock = &t_hashmutex[T_MUTEX_INDEX(hval)];
        struct tdirent  **linkp;

        mutex_enter(lock);
        for (linkp = &t_hashtable[T_HASH_INDEX(hval)]; *linkp != t;
            linkp = &(*linkp)->td_link)
                continue;
        *linkp = t->td_link;
        mutex_exit(lock);
}

/*
 * Point the existing directory entry tdp at a different tmpnode (fromtp).
 * Currently called by tdirrename() only.
 *
 * The store is made while holding the hash mutex that covers the entry,
 * which is the same mutex tmpfs_hash_lookup() holds while it reads
 * td_tmpnode, so a lookup never sees the pointer mid-change.
 */
static void
tmpfs_hash_change(struct tdirent *tdp, struct tmpnode *fromtp)
{
        kmutex_t        *lock = &t_hashmutex[T_MUTEX_INDEX(tdp->td_hash)];

        mutex_enter(lock);
        tdp->td_tmpnode = fromtp;
        mutex_exit(lock);
}

/*
 * Look up the entry called 'name' in directory 'parent' in the name hash.
 *
 * Returns the matching tdirent, or NULL if there is none.  On a match:
 *   - if 'hold' is nonzero, tmpnode_hold() is called on the entry's
 *     tmpnode;
 *   - if 'found' is non-NULL, *found is set to that tmpnode.
 * On a miss *found is left untouched.
 *
 * The tmpnode pointer is read once into a local while the hash mutex is
 * held, so the node that gets the hold is guaranteed to be the same node
 * that is handed back through *found.
 */
static struct tdirent *
tmpfs_hash_lookup(char *name, struct tmpnode *parent, uint_t hold,
        struct tmpnode **found)
{
        struct tdirent  *ent;
        struct tmpnode  *node;
        kmutex_t        *lock;
        uint_t          hval;

        TMPFS_HASH(parent, name, hval);
        lock = &t_hashmutex[T_MUTEX_INDEX(hval)];

        mutex_enter(lock);
        for (ent = t_hashtable[T_HASH_INDEX(hval)]; ent != NULL;
            ent = ent->td_link) {
                if (ent->td_hash != hval ||
                    ent->td_parent != parent ||
                    strcmp(ent->td_name, name) != 0)
                        continue;

                node = ent->td_tmpnode;
                if (hold) {
                        ASSERT(node);
                        tmpnode_hold(node);
                }
                if (found)
                        *found = node;
                break;
        }
        mutex_exit(lock);

        /* ent is NULL here when the chain was exhausted without a match. */
        return (ent);
}

/*
 * Search directory 'parent' for entry 'name'.
 *
 * The calling thread can't hold the write version
 * of the rwlock for the directory being searched
 *
 * 0 is returned on success and *foundtp points
 * to the found tmpnode with its vnode held.
 */
int
tdirlookup(
        struct tmpnode *parent,
        char *name,
        struct tmpnode **foundtp,
        struct cred *cred)
{
        int error;

        *foundtp = NULL;
        if (parent->tn_type != VDIR)
                return (ENOTDIR);

        if ((error = tmp_taccess(parent, VEXEC, cred)))
                return (error);

        if (*name == '\0') {
                tmpnode_hold(parent);
                *foundtp = parent;
                return (0);
        }

        /*
         * Search the directory for the matching name
         * We need the lock protecting the tn_dir list
         * so that it doesn't change out from underneath us.
         * tmpfs_hash_lookup() will pass back the tmpnode
         * with a hold on it.
         */

        if (tmpfs_hash_lookup(name, parent, 1, foundtp) != NULL) {
                ASSERT(*foundtp);
                return (0);
        }

        return (ENOENT);
}

/*
 * Enter a directory entry for 'name' and 'tp' into directory 'dir'.
 *
 * The caller must hold dir->tn_rwlock as writer.  What happens depends
 * on 'op':
 *
 *   DE_CREATE, DE_MKDIR  A new tmpnode is made with tdirmaketnode() and
 *                        entered under 'name'.  If the name already
 *                        exists and 'tpp' is non-NULL, the existing
 *                        (held) tmpnode is returned in *tpp together
 *                        with EEXIST.
 *   DE_LINK              'tp' is entered under 'name'; EEXIST if the
 *                        name is already present.
 *   DE_RENAME            'tp' is entered under 'name'; if the name is
 *                        already present the existing entry is
 *                        retargeted by tdirrename().
 *
 * For DE_LINK and DE_RENAME the link count of 'tp' is bumped up front
 * and backed out again at "out:" if anything later fails.
 *
 * Returns 0 on success, otherwise an errno value.
 */
int
tdirenter(
        struct tmount   *tm,
        struct tmpnode  *dir,           /* target directory to make entry in */
        char            *name,          /* name of entry */
        enum de_op      op,             /* entry operation */
        struct tmpnode  *fromparent,    /* source directory if rename */
        struct tmpnode  *tp,            /* source tmpnode, if link/rename */
        struct vattr    *va,
        struct tmpnode  **tpp,          /* return tmpnode, if create/mkdir */
        struct cred     *cred,
        caller_context_t *ctp)
{
        struct tdirent *tdp;
        struct tmpnode *found = NULL;
        int error = 0;
        char *s;

        /*
         * tn_rwlock is held to serialize direnter and dirdeletes
         */
        ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
        ASSERT(dir->tn_type == VDIR);

        /*
         * Don't allow '/' characters in a pathname component
         * (ufs_direnter() behaves the same way).
         */
        for (s = name; *s; s++)
                if (*s == '/')
                        return (EACCES);

        /* An empty name is a caller bug, not a user error. */
        if (name[0] == '\0')
                panic("tdirenter: NULL name");

        /*
         * For link and rename lock the source entry and check the link count
         * to see if it has been removed while it was unlocked.
         */
        if (op == DE_LINK || op == DE_RENAME) {
                if (tp != dir) {
                        unsigned int tries = 0;

                        /*
                         * If we are acquiring tp->tn_rwlock (for SOURCE)
                         * inside here, we must consider the following:
                         *
                         * - dir->tn_rwlock (TARGET) is already HELD (see
                         * above ASSERT()).
                         *
                         * - It is possible our SOURCE is a parent of our
                         * TARGET. Yes it's unusual, but it will return an
                         * error below via tdircheckpath().
                         *
                         * - It is also possible that another thread,
                         * concurrent to this one, is performing
                         * rmdir(TARGET), which means it will first acquire
                         * SOURCE's lock, THEN acquire TARGET's lock, which
                         * could result in this thread holding TARGET and
                         * trying for SOURCE, but the other thread holding
                         * SOURCE and trying for TARGET.  This is deadlock,
                         * and it's inducible.
                         *
                         * To prevent this, we borrow some techniques from UFS
                         * and rw_tryenter(), delaying if we fail, and
                         * if someone tweaks the number of backoff tries to be
                         * nonzero, return EBUSY after that number of tries.
                         */
                        while (!rw_tryenter(&tp->tn_rwlock, RW_WRITER)) {
                                /*
                                 * Sloppy, but this is a diagnostic so atomic
                                 * increment would be overkill.
                                 */
                                tmpfs_rename_loops++;

                                /*
                                 * Returning directly (rather than via "out")
                                 * is correct here: the link count of tp has
                                 * not been bumped yet, so there is nothing
                                 * to undo.
                                 */
                                if (tmpfs_rename_backoff_tries != 0) {
                                        if (tries > tmpfs_rename_backoff_tries)
                                                return (EBUSY);
                                        tries++;
                                }
                                /*
                                 * NOTE: We're still holding dir->tn_rwlock,
                                 * so drop it over the delay, so any other
                                 * thread can get its business done.
                                 *
                                 * No state change or state inspection happens
                                 * prior to here, so it is not wholly dangerous
                                 * to release-and-reacquire dir->tn_rwlock.
                                 *
                                 * Hold the vnode of dir in case it gets
                                 * released by another thread, though.
                                 */
                                VN_HOLD(TNTOV(dir));
                                rw_exit(&dir->tn_rwlock);
                                delay(tmpfs_rename_backoff_delay);
                                rw_enter(&dir->tn_rwlock, RW_WRITER);
                                VN_RELE(TNTOV(dir));
                        }
                }
                /*
                 * tp->tn_rwlock is held here unless tp == dir, in which
                 * case the caller's hold on dir->tn_rwlock covers it.
                 * The link count itself is protected by tn_tlock.
                 */
                mutex_enter(&tp->tn_tlock);
                if (tp->tn_nlink == 0) {
                        mutex_exit(&tp->tn_tlock);
                        if (tp != dir)
                                rw_exit(&tp->tn_rwlock);
                        return (ENOENT);
                }

                if (tp->tn_nlink == MAXLINK) {
                        mutex_exit(&tp->tn_tlock);
                        if (tp != dir)
                                rw_exit(&tp->tn_rwlock);
                        return (EMLINK);
                }
                tp->tn_nlink++;
                gethrestime(&tp->tn_ctime);
                mutex_exit(&tp->tn_tlock);
                if (tp != dir)
                        rw_exit(&tp->tn_rwlock);
        }

        /*
         * This might be a "dangling detached directory": one that has
         * been removed while a reference to it was kept in u_cwd.
         * Don't bother searching it; just fail with ENOENT (UFS does
         * the same).
         */
        if (dir->tn_nlink == 0) {
                error = ENOENT;
                goto out;
        }

        /*
         * If this is a rename of a directory and the parent is
         * different (".." must be changed), then the source
         * directory must not be in the directory hierarchy
         * above the target, as this would orphan everything
         * below the source directory.
         */
        if (op == DE_RENAME) {
                if (tp == dir) {
                        error = EINVAL;
                        goto out;
                }
                if (tp->tn_type == VDIR) {
                        if ((fromparent != dir) &&
                            (error = tdircheckpath(tp, dir, cred))) {
                                goto out;
                        }
                }
        }

        /*
         * Search for the entry.  On a hit, "found" is the entry's
         * tmpnode with a hold on it that must be released (or handed
         * to the caller through *tpp) below.
         */
        tdp = tmpfs_hash_lookup(name, dir, 1, &found);

        if (tdp) {
                ASSERT(found);
                switch (op) {
                case DE_CREATE:
                case DE_MKDIR:
                        /*
                         * NOTE(review): when tpp is NULL the hold is
                         * dropped and error stays 0 even though nothing
                         * was created -- confirm that callers always
                         * pass tpp for create/mkdir.
                         */
                        if (tpp) {
                                *tpp = found;
                                error = EEXIST;
                        } else {
                                tmpnode_rele(found);
                        }
                        break;

                case DE_RENAME:
                        /*
                         * Any nonzero return from tdirrename() (including
                         * its "renamed onto itself" code) reaches "out"
                         * as an error, so the link count bump is undone.
                         */
                        error = tdirrename(fromparent, tp,
                            dir, name, found, tdp, cred);
                        if (error == 0) {
                                if (found != NULL) {
                                        vnevent_rename_dest(TNTOV(found),
                                            TNTOV(dir), name, ctp);
                                }
                        }

                        tmpnode_rele(found);
                        break;

                case DE_LINK:
                        /*
                         * Can't link to an existing file.
                         */
                        error = EEXIST;
                        tmpnode_rele(found);
                        break;
                }
        } else {

                /*
                 * The entry does not exist. Check write permission in
                 * directory to see if entry can be created.
                 */
                if (error = tmp_taccess(dir, VWRITE, cred))
                        goto out;
                if (op == DE_CREATE || op == DE_MKDIR) {
                        /*
                         * Make new tmpnode and directory entry as required.
                         * From here on tp refers to the new tmpnode.
                         */
                        error = tdirmaketnode(dir, tm, va, op, &tp, cred);
                        if (error)
                                goto out;
                }
                if (error = tdiraddentry(dir, tp, name, op, fromparent)) {
                        if (op == DE_CREATE || op == DE_MKDIR) {
                                /*
                                 * Unmake the tmpnode we just made.
                                 */
                                rw_enter(&tp->tn_rwlock, RW_WRITER);
                                if ((tp->tn_type) == VDIR) {
                                        ASSERT(tdp == NULL);
                                        /*
                                         * cleanup allocs made by tdirinit()
                                         */
                                        tdirtrunc(tp);
                                }
                                mutex_enter(&tp->tn_tlock);
                                tp->tn_nlink = 0;
                                mutex_exit(&tp->tn_tlock);
                                gethrestime(&tp->tn_ctime);
                                rw_exit(&tp->tn_rwlock);
                                tmpnode_rele(tp);
                                tp = NULL;
                        }
                } else if (tpp) {
                        *tpp = tp;
                } else if (op == DE_CREATE || op == DE_MKDIR) {
                        /* Nobody wants the new tmpnode back; drop the hold. */
                        tmpnode_rele(tp);
                }
        }

out:
        /*
         * tp is only dereferenced here for link/rename; the create/mkdir
         * failure path above may have set it to NULL.
         */
        if (error && (op == DE_LINK || op == DE_RENAME)) {
                /*
                 * Undo bumped link count.
                 */
                DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
                gethrestime(&tp->tn_ctime);
        }
        return (error);
}

/*
 * Delete entry tp of name "nm" from dir.
 * Free dir entry space and decrement link count on tmpnode(s).
 *
 * Return 0 on success.
 */
int
tdirdelete(
        struct tmpnode *dir,
        struct tmpnode *tp,
        char *nm,
        enum dr_op op,
        struct cred *cred)
{
        struct tdirent *tpdp;
        int error;
        size_t namelen;
        struct tmpnode *tnp;
        timestruc_t now;

        ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
        ASSERT(RW_WRITE_HELD(&tp->tn_rwlock));
        ASSERT(dir->tn_type == VDIR);

        if (nm[0] == '\0')
                panic("tdirdelete: NULL name for %p", (void *)tp);

        /*
         * return error when removing . and ..
         */
        if (nm[0] == '.') {
                if (nm[1] == '\0')
                        return (EINVAL);
                if (nm[1] == '.' && nm[2] == '\0')
                        return (EEXIST); /* thus in ufs */
        }

        if (error = tmp_taccess(dir, VEXEC|VWRITE, cred))
                return (error);

        /*
         * If the parent directory is "sticky", then the user must
         * own the parent directory or the file in it, or else must
         * have permission to write the file.  Otherwise it may not
         * be deleted (except by privileged users).
         * Same as ufs_dirremove.
         */
        if ((error = tmp_sticky_remove_access(dir, tp, cred)) != 0)
                return (error);

        if (dir->tn_dir == NULL)
                return (ENOENT);

        tpdp = tmpfs_hash_lookup(nm, dir, 0, &tnp);
        if (tpdp == NULL) {
                /*
                 * If it is gone, some other thread got here first!
                 * Return error ENOENT.
                 */
                return (ENOENT);
        }

        /*
         * If the tmpnode in the tdirent changed, we were probably
         * the victim of a concurrent rename operation.  The original
         * is gone, so return that status (same as UFS).
         */
        if (tp != tnp)
                return (ENOENT);

        tmpfs_hash_out(tpdp);

        /*
         * Take tpdp out of the directory list.
         */
        ASSERT(tpdp->td_next != tpdp);
        ASSERT(tpdp->td_prev != tpdp);
        if (tpdp->td_prev) {
                tpdp->td_prev->td_next = tpdp->td_next;
        }
        if (tpdp->td_next) {
                tpdp->td_next->td_prev = tpdp->td_prev;
        }

        /*
         * If the roving slot pointer happens to match tpdp,
         * point it at the previous dirent.
         */
        if (dir->tn_dir->td_prev == tpdp) {
                dir->tn_dir->td_prev = tpdp->td_prev;
        }
        ASSERT(tpdp->td_next != tpdp);
        ASSERT(tpdp->td_prev != tpdp);

        /*
         * tpdp points to the correct directory entry
         */
        namelen = strlen(tpdp->td_name) + 1;

        tmp_memfree(tpdp, sizeof (struct tdirent) + namelen);
        dir->tn_size -= (sizeof (struct tdirent) + namelen);
        dir->tn_dirents--;

        gethrestime(&now);
        dir->tn_mtime = now;
        dir->tn_ctime = now;
        tp->tn_ctime = now;

        ASSERT(tp->tn_nlink > 0);
        DECR_COUNT(&tp->tn_nlink, &tp->tn_tlock);
        if (op == DR_RMDIR && tp->tn_type == VDIR) {
                tdirtrunc(tp);
                ASSERT(tp->tn_nlink == 0);
        }
        return (0);
}

/*
 * tdirinit is used internally to give a new directory (dir) its "." and
 * ".." entries.  No permission checks are made and no locks are taken
 * here; the caller must already hold parent->tn_rwlock as writer.
 */
void
tdirinit(
        struct tmpnode *parent,         /* parent of directory to initialize */
        struct tmpnode *dir)            /* the new directory */
{
        struct tdirent *self_ent, *up_ent;
        timestruc_t now;

        ASSERT(RW_WRITE_HELD(&parent->tn_rwlock));
        ASSERT(dir->tn_type == VDIR);

        /*
         * Each entry carries its name directly after the structure:
         * 2 bytes for "." and 3 bytes for "..".
         *
         * NOTE(review): only the '.' characters are stored below, never
         * a terminating NUL; presumably tmp_memalloc() hands back zeroed
         * memory -- confirm.
         */
        self_ent = tmp_memalloc(sizeof (struct tdirent) + 2, TMP_MUSTHAVE);
        up_ent = tmp_memalloc(sizeof (struct tdirent) + 3, TMP_MUSTHAVE);

        /*
         * "." refers to the directory itself.
         */
        self_ent->td_tmpnode = dir;
        self_ent->td_offset = 0;
        self_ent->td_name = (char *)(self_ent + 1);
        self_ent->td_name[0] = '.';
        self_ent->td_parent = dir;
        tmpfs_hash_in(self_ent);

        /*
         * ".." refers to the parent.
         */
        up_ent->td_tmpnode = parent;
        up_ent->td_offset = 1;
        up_ent->td_name = (char *)(up_ent + 1);
        up_ent->td_name[0] = '.';
        up_ent->td_name[1] = '.';
        up_ent->td_parent = dir;
        tmpfs_hash_in(up_ent);

        /*
         * Chain the two entries together.  The td_prev of "." is not a
         * real back link; it holds the roving slot pointer.
         */
        self_ent->td_next = up_ent;
        self_ent->td_prev = up_ent;
        up_ent->td_next = NULL;
        up_ent->td_prev = self_ent;

        gethrestime(&now);
        dir->tn_mtime = now;
        dir->tn_ctime = now;

        /*
         * Link counts are special for the hidden attribute directory:
         * its ".." is not counted on the parent, and since the attrdir
         * is created as a side effect of lookup the parent's ctime is
         * left alone too.
         *
         * For the root vnode this routine is called with dir and parent
         * being the same tmpnode, so this increment has to come before
         * tn_nlink is set to 2 below.
         */
        if (!(dir->tn_vnode->v_flag & V_XATTRDIR)) {
                INCR_COUNT(&parent->tn_nlink, &parent->tn_tlock);
                parent->tn_ctime = now;
        }

        dir->tn_dir = self_ent;
        dir->tn_size = 2 * sizeof (struct tdirent) + 5; /* dot and dotdot */
        dir->tn_dirents = 2;
        dir->tn_nlink = 2;
}


/*
 * tdirtrunc removes every directory entry, "." and ".." included, from
 * directory dir.  The caller must hold dir->tn_rwlock as writer.
 *
 * For each entry the link count of the tmpnode it names is decremented,
 * the entry is taken out of the name hash, and its memory is freed.  On
 * return the directory has no entries and a size of zero.
 */
void
tdirtrunc(struct tmpnode *dir)
{
        struct tdirent *ent;
        struct tmpnode *target;
        size_t entsize;
        timestruc_t now;
        int in_xattrdir;

        ASSERT(RW_WRITE_HELD(&dir->tn_rwlock));
        ASSERT(dir->tn_type == VDIR);

        in_xattrdir = (dir->tn_vnode->v_flag & V_XATTRDIR) ? 1 : 0;

        while ((ent = dir->tn_dir) != NULL) {
                ASSERT(ent->td_next != ent);
                ASSERT(ent->td_prev != ent);
                ASSERT(ent->td_tmpnode);

                /*
                 * Pop the entry off the head of the list and note its
                 * size while its name can still be read.
                 */
                dir->tn_dir = ent->td_next;
                entsize = sizeof (struct tdirent) + strlen(ent->td_name) + 1;

                /*
                 * Drop the link this entry contributed.  The one
                 * exception is ".." in a hidden attribute directory:
                 * that reference was never counted on the parent, so it
                 * must not be taken away either.
                 */
                target = ent->td_tmpnode;
                if (!in_xattrdir || strcmp("..", ent->td_name) != 0) {
                        ASSERT(target->tn_nlink > 0);
                        DECR_COUNT(&target->tn_nlink, &target->tn_tlock);
                }

                tmpfs_hash_out(ent);

                tmp_memfree(ent, entsize);
                dir->tn_size -= entsize;
                dir->tn_dirents--;
        }

        gethrestime(&now);
        dir->tn_mtime = now;
        dir->tn_ctime = now;

        ASSERT(dir->tn_dir == NULL);
        ASSERT(dir->tn_size == 0);
        ASSERT(dir->tn_dirents == 0);
}

/*
 * Check if the source directory is in the path of the target directory.
 * The target directory is locked by the caller.
 *
 * XXX - The source and target's should be different upon entry.
 */
static int
tdircheckpath(
        struct tmpnode *fromtp,
        struct tmpnode  *toparent,
        struct cred     *cred)
{
        int     error = 0;
        struct tmpnode *dir, *dotdot;
        struct tdirent *tdp;

        ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));

        tdp = tmpfs_hash_lookup("..", toparent, 1, &dotdot);
        if (tdp == NULL)
                return (ENOENT);

        ASSERT(dotdot);

        if (dotdot == toparent) {
                /* root of fs.  search trivially satisfied. */
                tmpnode_rele(dotdot);
                return (0);
        }
        for (;;) {
                /*
                 * Return error for cases like "mv c c/d",
                 * "mv c c/d/e" and so on.
                 */
                if (dotdot == fromtp) {
                        tmpnode_rele(dotdot);
                        error = EINVAL;
                        break;
                }
                dir = dotdot;
                error = tdirlookup(dir, "..", &dotdot, cred);
                if (error) {
                        tmpnode_rele(dir);
                        break;
                }
                /*
                 * We're okay if we traverse the directory tree up to
                 * the root directory and don't run into the
                 * parent directory.
                 */
                if (dir == dotdot) {
                        tmpnode_rele(dir);
                        tmpnode_rele(dotdot);
                        break;
                }
                tmpnode_rele(dir);
        }
        return (error);
}

/*
 * Retarget an existing directory entry during rename.
 *
 * The entry 'where' (named 'nm' in 'toparent') currently refers to 'to';
 * on success it is changed to refer to 'fromtp', the link count of 'to'
 * is decremented, and, if directories are being renamed, 'to' is emptied
 * and the ".." of 'fromtp' is rewritten when the parent changes.
 *
 * The caller must hold toparent->tn_rwlock as writer.  This routine takes
 * fromtp->tn_rwlock and to->tn_rwlock itself and drops both before
 * returning.
 *
 * Returns 0 on success, ESAME if source and target are the same tmpnode,
 * EXDEV, EISDIR, ENOTDIR, EBUSY, EEXIST (target directory not empty), or
 * an error from the access / sticky-bit checks.
 */
static int
tdirrename(
        struct tmpnode *fromparent,     /* parent directory of source */
        struct tmpnode *fromtp,         /* source tmpnode */
        struct tmpnode *toparent,       /* parent directory of target */
        char *nm,                       /* entry we are trying to change */
        struct tmpnode *to,             /* target tmpnode */
        struct tdirent *where,          /* target tmpnode directory entry */
        struct cred *cred)              /* credentials */
{
        int error = 0;
        int doingdirectory;
        timestruc_t now;

        /* nm is otherwise unused; this keeps lint quiet about it. */
#if defined(lint)
        nm = nm;
#endif
        ASSERT(RW_WRITE_HELD(&toparent->tn_rwlock));

        /*
         * Short circuit rename of something to itself.
         */
        if (fromtp == to)
                return (ESAME);         /* special KLUDGE error code */

        rw_enter(&fromtp->tn_rwlock, RW_READER);
        rw_enter(&to->tn_rwlock, RW_READER);

        /*
         * Check that everything is on the same filesystem.
         */
        if (to->tn_vnode->v_vfsp != toparent->tn_vnode->v_vfsp ||
            to->tn_vnode->v_vfsp != fromtp->tn_vnode->v_vfsp) {
                error = EXDEV;
                goto out;
        }

        /*
         * Must have write permission to rewrite target entry.
         * Check for stickiness.
         */
        if ((error = tmp_taccess(toparent, VWRITE, cred)) != 0 ||
            (error = tmp_sticky_remove_access(toparent, to, cred)) != 0)
                goto out;

        /*
         * Ensure source and target are compatible (both directories
         * or both not directories).  If target is a directory it must
         * be empty and have no links to it; in addition it must not
         * be a mount point, and both the source and target must be
         * writable.
         */
        doingdirectory = (fromtp->tn_type == VDIR);
        if (to->tn_type == VDIR) {
                if (!doingdirectory) {
                        error = EISDIR;
                        goto out;
                }
                /*
                 * vn_vfswlock will prevent mounts from using the directory
                 * until we are done.  Once taken it is released on each
                 * failure path below, or after the entry has been
                 * retargeted on success.
                 */
                if (vn_vfswlock(TNTOV(to))) {
                        error = EBUSY;
                        goto out;
                }
                if (vn_mountedvfs(TNTOV(to)) != NULL) {
                        vn_vfsunlock(TNTOV(to));
                        error = EBUSY;
                        goto out;
                }

                mutex_enter(&to->tn_tlock);
                if (to->tn_dirents > 2 || to->tn_nlink > 2) {
                        mutex_exit(&to->tn_tlock);
                        vn_vfsunlock(TNTOV(to));
                        error = EEXIST; /* SIGH should be ENOTEMPTY */
                        /*
                         * Update atime because checking tn_dirents is
                         * logically equivalent to reading the directory
                         */
                        gethrestime(&to->tn_atime);
                        goto out;
                }
                mutex_exit(&to->tn_tlock);
        } else if (doingdirectory) {
                error = ENOTDIR;
                goto out;
        }

        /*
         * All checks passed: point the existing entry at the source
         * tmpnode.  From here on there are no failure paths.
         */
        tmpfs_hash_change(where, fromtp);
        gethrestime(&now);
        toparent->tn_mtime = now;
        toparent->tn_ctime = now;

        /*
         * Switch to a write lock on "to" (i.e., the target tmpnode).
         * This is a drop followed by a fresh acquire, not an atomic
         * upgrade, so the lock is momentarily not held in between.
         */
        rw_exit(&to->tn_rwlock);
        rw_enter(&to->tn_rwlock, RW_WRITER);

        /*
         * Decrement the link count of the target tmpnode.
         */
        DECR_COUNT(&to->tn_nlink, &to->tn_tlock);
        to->tn_ctime = now;

        if (doingdirectory) {
                /*
                 * The entry for "to" no longer exists so release the vfslock.
                 */
                vn_vfsunlock(TNTOV(to));

                /*
                 * Decrement the target link count and delete all entries.
                 */
                tdirtrunc(to);
                ASSERT(to->tn_nlink == 0);

                /*
                 * Renaming a directory with the parent different
                 * requires that ".." be rewritten.  The window is
                 * still there for ".." to be inconsistent, but this
                 * is unavoidable, and a lot shorter than when it was
                 * done in a user process.
                 */
                if (fromparent != toparent)
                        tdirfixdotdot(fromtp, fromparent, toparent);
        }
out:
        /* Both locks are held on every path that reaches this label. */
        rw_exit(&to->tn_rwlock);
        rw_exit(&fromtp->tn_rwlock);
        return (error);
}

/*
 * Re-target the ".." entry of a directory that is being attached to a
 * new parent, and adjust the link counts of the parents involved.
 *
 *      fromtp          directory whose ".." entry is rewritten
 *      fromparent      directory ".." is expected to reference on entry;
 *                      when NULL, no old link count is adjusted
 *      toparent        directory ".." references on return
 *
 * The caller must hold toparent->tn_rwlock (asserted below).
 */
static void
tdirfixdotdot(
        struct tmpnode  *fromtp,        /* child directory */
        struct tmpnode  *fromparent,    /* old parent directory */
        struct tmpnode  *toparent)      /* new parent directory */
{
        struct tdirent  *parentlink;

        ASSERT(RW_LOCK_HELD(&toparent->tn_rwlock));

        /*
         * The new parent picks up a link for the child's "..", and its
         * change time is refreshed.
         */
        INCR_COUNT(&toparent->tn_nlink, &toparent->tn_tlock);
        gethrestime(&toparent->tn_ctime);

        /*
         * Find the child's ".." entry and swing it over to the new parent.
         */
        parentlink = tmpfs_hash_lookup("..", fromtp, 0, NULL);
        ASSERT(parentlink->td_tmpnode == fromparent);
        parentlink->td_tmpnode = toparent;

        /*
         * A NULL fromparent means this is a new directory link with no
         * previous parent, so there is no old link count to give back.
         */
        if (fromparent == NULL)
                return;

        /*
         * Give back the old parent's link.  The count is never taken
         * below zero, and the change time is only touched when the count
         * was actually decremented.
         */
        mutex_enter(&fromparent->tn_tlock);
        if (fromparent->tn_nlink != 0) {
                fromparent->tn_nlink--;
                gethrestime(&fromparent->tn_ctime);
        }
        mutex_exit(&fromparent->tn_tlock);
}

/*
 * Create a directory entry called "name" in "dir" that refers to "tp".
 *
 * When op is DE_RENAME and tp is a directory, tp's ".." entry is first
 * rewritten via tdirfixdotdot(), with fromtp passed as the old parent.
 *
 * Returns 0 on success, or:
 *      ENOENT  dir no longer has an entry list (dir->tn_dir is NULL)
 *      EXDEV   tp and dir do not belong to the same vfs
 *      ENOSPC  memory for the new entry could not be allocated
 */
static int
tdiraddentry(
        struct tmpnode  *dir,   /* target directory to make entry in */
        struct tmpnode  *tp,    /* new tmpnode */
        char            *name,
        enum de_op      op,
        struct tmpnode  *fromtp)
{
        struct tdirent  *entry;         /* entry being created */
        struct tdirent  *slot;          /* entry the new one follows */
        size_t          entsize;        /* bytes for the entry plus name */
        timestruc_t     stamp;

        /*
         * The directory may have been removed out from under the caller.
         */
        if (dir->tn_dir == NULL)
                return (ENOENT);

        /*
         * Both tmpnodes have to live on the same filesystem.
         */
        if (tp->tn_vnode->v_vfsp != dir->tn_vnode->v_vfsp)
                return (EXDEV);

        /*
         * A single allocation holds the tdirent followed immediately by
         * the NUL-terminated name.
         */
        entsize = sizeof (struct tdirent) + strlen(name) + 1;
        entry = tmp_memalloc(entsize, 0);
        if (entry == NULL)
                return (ENOSPC);

        if (op == DE_RENAME && tp->tn_type == VDIR)
                tdirfixdotdot(tp, fromtp, dir);

        dir->tn_size += entsize;
        dir->tn_dirents++;
        entry->td_tmpnode = tp;
        entry->td_parent = dir;

        /*
         * The name is stored directly behind the tdirent itself.
         */
        entry->td_name = (char *)(entry + 1);
        (void) strcpy(entry->td_name, name);

        tmpfs_hash_in(entry);

        /*
         * Directory offsets are phony: each entry carries a small
         * integer td_offset rather than a byte position, so that
         * utilities which unlink files between readdir() calls do not
         * see offsets shift underneath them.  tmp_readdir uses these
         * offsets to detect end of file.
         *
         * Starting at the roving slot pointer (kept in the first
         * entry's td_prev), walk forward until the gap to the next
         * entry's offset is larger than one, or the list ends.  The new
         * entry takes the offset just after that position.
         */
        for (slot = dir->tn_dir->td_prev;
            slot->td_next != NULL &&
            (slot->td_next->td_offset - slot->td_offset) <= 1;
            slot = slot->td_next) {
                ASSERT(slot->td_next != slot);
                ASSERT(slot->td_prev != slot);
                ASSERT(slot->td_next->td_offset > slot->td_offset);
        }
        entry->td_offset = slot->td_offset + 1;

        /*
         * Normally the roving slot pointer advances to the new entry.
         * If, however, we ran off the end of the list and the last
         * offset (necessarily the largest in the directory) exceeds
         * twice the number of entries, the directory is at least half
         * holes; reset the pointer to the start so the holes get
         * reused.  With N entries there are then at least N holes, so
         * the next N creates walk at most 2N entries and the average
         * cost of a create stays constant.  Keeping the pointer in the
         * first entry's td_prev is ugly, but saves a word per entry.
         */
        if (slot->td_next != NULL || slot->td_offset <= 2 * dir->tn_dirents)
                dir->tn_dir->td_prev = entry;
        else
                dir->tn_dir->td_prev = dir->tn_dir->td_next;

        ASSERT(slot->td_next != slot);
        ASSERT(slot->td_prev != slot);

        /*
         * Splice the new entry into the list right after "slot".
         */
        entry->td_next = slot->td_next;
        if (entry->td_next != NULL)
                entry->td_next->td_prev = entry;
        entry->td_prev = slot;
        slot->td_next = entry;

        ASSERT(entry->td_next != entry);
        ASSERT(entry->td_prev != entry);
        ASSERT(slot->td_next != slot);
        ASSERT(slot->td_prev != slot);

        /*
         * Adding an entry modifies the directory.
         */
        gethrestime(&stamp);
        dir->tn_mtime = stamp;
        dir->tn_ctime = stamp;

        return (0);
}

static int
tdirmaketnode(
        struct tmpnode *dir,
        struct tmount   *tm,
        struct vattr    *va,
        enum    de_op   op,
        struct tmpnode **newnode,
        struct cred     *cred)
{
        struct tmpnode *tp;
        enum vtype      type;

        ASSERT(va != NULL);
        ASSERT(op == DE_CREATE || op == DE_MKDIR);
        if (((va->va_mask & AT_ATIME) && TIMESPEC_OVERFLOW(&va->va_atime)) ||
            ((va->va_mask & AT_MTIME) && TIMESPEC_OVERFLOW(&va->va_mtime)))
                return (EOVERFLOW);
        type = va->va_type;
        tp = tmp_memalloc(sizeof (struct tmpnode), TMP_MUSTHAVE);
        tmpnode_init(tm, tp, va, cred);

        /* setup normal file/dir's extended attribute directory */
        if (dir->tn_flags & ISXATTR) {
                /* parent dir is , mark file as xattr */
                tp->tn_flags |= ISXATTR;
        }


        if (type == VBLK || type == VCHR) {
                tp->tn_vnode->v_rdev = tp->tn_rdev = va->va_rdev;
        } else {
                tp->tn_vnode->v_rdev = tp->tn_rdev = NODEV;
        }
        tp->tn_vnode->v_type = type;
        tp->tn_uid = crgetuid(cred);

        /*
         * To determine the group-id of the created file:
         *   1) If the gid is set in the attribute list (non-Sun & pre-4.0
         *      clients are not likely to set the gid), then use it if
         *      the process is privileged, belongs to the target group,
         *      or the group is the same as the parent directory.
         *   2) If the filesystem was not mounted with the Old-BSD-compatible
         *      GRPID option, and the directory's set-gid bit is clear,
         *      then use the process's gid.
         *   3) Otherwise, set the group-id to the gid of the parent directory.
         */
        if ((va->va_mask & AT_GID) &&
            ((va->va_gid == dir->tn_gid) || groupmember(va->va_gid, cred) ||
            secpolicy_vnode_create_gid(cred) == 0)) {
                /*
                 * XXX - is this only the case when a 4.0 NFS client, or a
                 * client derived from that code, makes a call over the wire?
                 */
                tp->tn_gid = va->va_gid;
        } else {
                if (dir->tn_mode & VSGID)
                        tp->tn_gid = dir->tn_gid;
                else
                        tp->tn_gid = crgetgid(cred);
        }
        /*
         * If we're creating a directory, and the parent directory has the
         * set-GID bit set, set it on the new directory.
         * Otherwise, if the user is neither privileged nor a member of the
         * file's new group, clear the file's set-GID bit.
         */
        if (dir->tn_mode & VSGID && type == VDIR)
                tp->tn_mode |= VSGID;
        else {
                if ((tp->tn_mode & VSGID) &&
                    secpolicy_vnode_setids_setgids(cred, tp->tn_gid) != 0)
                        tp->tn_mode &= ~VSGID;
        }

        if (va->va_mask & AT_ATIME)
                tp->tn_atime = va->va_atime;
        if (va->va_mask & AT_MTIME)
                tp->tn_mtime = va->va_mtime;

        if (op == DE_MKDIR)
                tdirinit(dir, tp);

        *newnode = tp;
        return (0);
}