root/usr/src/lib/lib9p/common/backend/fs.c
/*
 * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
 * All rights reserved
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted providing that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 *
 * Copyright 2021 Joyent, Inc.
 */

/*
 * Based on libixp code: ©2007-2010 Kris Maglione <maglione.k at Gmail>
 */

#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <stdbool.h>
#include <fcntl.h>
#include <errno.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mount.h>
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <dirent.h>
#include <pwd.h>
#include <grp.h>
#include <libgen.h>
#include <pthread.h>
#include "../lib9p.h"
#include "../lib9p_impl.h"
#include "../fid.h"
#include "../log.h"
#include "../rfuncs.h"
#include "../genacl.h"
#include "backend.h"
#include "fs.h"

#if defined(WITH_CASPER)
  #include <libcasper.h>
  #include <casper/cap_pwd.h>
  #include <casper/cap_grp.h>
#endif

#if defined(__FreeBSD__)
  #include <sys/param.h>
  #if __FreeBSD_version >= 1000000
    #define     HAVE_BINDAT
  #endif
#endif

#if defined(__FreeBSD__)
  #define       HAVE_BIRTHTIME
#endif

#if defined(__APPLE__)
  #include <sys/syscall.h>
  #include "Availability.h"
  #define ACL_TYPE_NFS4 ACL_TYPE_EXTENDED
#endif

#if defined (__illumos__)
  #include <sys/sysmacros.h>
  #include <sys/statvfs.h>
  #include <sys/un.h>
  #include <attr.h>
  #include <sys/nvpair.h>
#endif

/*
 * Per-backend state: one of these describes one exported tree.
 */
struct fs_softc {
        int     fs_rootfd;      /* fd of export root; dirfd given to attach fids */
        bool    fs_readonly;    /* if set, fs_nde() refuses creates with EROFS */
#if defined(__illumos__)
        /*
         * On illumos, the file creation time (birthtime) is stored (on
         * supported filesystems -- i.e. zfs) in an extended attribute.
         * If for some reason the fs doesn't support extended attributes,
         * we skip trying to read the creation time.
         */
        bool    fs_hasxattr;
#endif
#if defined(WITH_CASPER)
        cap_channel_t *fs_cappwd;       /* channel used for passwd lookups */
        cap_channel_t *fs_capgrp;       /* channel used for group lookups */
#endif
};

/*
 * Backend data attached to a struct l9p_fid through its lo_aux pointer;
 * allocated by open_fid().
 */
struct fs_fid {
        DIR     *ff_dir;        /* directory stream, if any; fs_clunk() closes it */
        int     ff_dirfd;       /* directory fd that ff_name is resolved against */
        int     ff_fd;          /* open file descriptor, or -1 if not open */
        int     ff_flags;       /* FF_* flags (e.g. FF_NO_NFSV4_ACL) */
        char    *ff_name;       /* path name, strdup()ed by open_fid() */
        struct fs_authinfo *ff_ai;      /* authinfo; this fid holds a reference */
        pthread_mutex_t ff_mtx; /* initialized in open_fid() */
        struct l9p_acl *ff_acl; /* cached ACL if any */
};

/*
 * Platform accessors.  STATFS_FSID() produces the value stored in
 * l9p_statfs.fsid by dostatfs(); the STAT_[AMC]TIME() macros name the
 * platform's access/modify/change time members of struct stat.
 * Any platform other than FreeBSD or illumos fails the build here.
 */
#if defined(__FreeBSD__)
# define        STATFS_FSID(_s) \
        (((uint64_t)(_s)->f_fsid.val[0] << 32) | (uint64_t)(_s)->f_fsid.val[1])

# define        STAT_ATIME(_s)  ((_s)->st_atimespec)
# define        STAT_MTIME(_s)  ((_s)->st_mtimespec)
# define        STAT_CTIME(_s)  ((_s)->st_ctimespec)
#elif defined (__illumos__)
# define        STATFS_FSID(_s) ((_s)->f_fsid)

# define        STAT_ATIME(_s)  ((_s)->st_atim)
# define        STAT_MTIME(_s)  ((_s)->st_mtim)
# define        STAT_CTIME(_s)  ((_s)->st_ctim)
#else
#error "Port me"
#endif

#define FF_NO_NFSV4_ACL 0x01    /* don't go looking for NFSv4 ACLs */
/*      FF_NO_POSIX_ACL 0x02    -- not yet */

/*
 * Our authinfo consists of:
 *
 *  - a reference count
 *  - a uid
 *  - a gid-set
 *
 * The "default" gid is the first gid in the gid-set, provided the
 * set size is at least 1.  The set-size may be zero, though.
 *
 * Adjustments to the ref-count must be atomic, once it's shared.
 * It would be nice to use C11 atomics here but they are not yet
 * common enough across all systems; for now, we use a mutex.
 *
 * Note that some ops (Linux style ones) pass an effective gid for
 * the op, in which case, that gid may override.  To achieve this
 * effect, permissions testing functions also take an extra gid.
 * If this gid is (gid_t)-1 it is not used and only the remaining
 * gids take part.
 *
 * The uid may also be (uid_t)-1, meaning "no uid was available
 * at all at attach time".  In this case, new files inherit parent
 * directory uids.
 *
 * The refcount is simply the number of "openfile"s using this
 * authinfo (so that when the last ref goes away, we can free it).
 *
 * There are also master ACL flags (same as in ff_flags).
 */
struct fs_authinfo {
        pthread_mutex_t ai_mtx; /* lock for refcnt */
        uint32_t ai_refcnt;     /* number of open files using this authinfo */
        int     ai_flags;       /* master ACL flags (same bits as ff_flags) */
        uid_t   ai_uid;         /* attach-time uid, or (uid_t)-1 if none */
        int     ai_ngids;       /* number of entries in ai_gids; may be 0 */
        gid_t   ai_gids[];      /* NB: flexible array member */
};

/*
 * We have a global-static mutex for single-threading Tattach
 * requests, which use getpwnam (and indirectly, getgr* functions)
 * which are not reentrant.
 */
/*
 * NOTE(review): fs_attach_mutex_inited presumably records that
 * fs_attach_mutex has been initialized; the code that sets it is not
 * visible here -- confirm.
 */
static bool fs_attach_mutex_inited;
static pthread_mutex_t fs_attach_mutex;

/*
 * Mutex attributes handed to pthread_mutex_init() on illumos builds
 * (see open_fid() and fs_attach()); other builds pass NULL instead.
 */
static pthread_mutexattr_t fs_mutexattr;

/*
 * Internal functions (except inline functions).
 */
static struct passwd *fs_getpwuid(struct fs_softc *, uid_t, struct r_pgdata *);
static struct group *fs_getgrgid(struct fs_softc *, gid_t, struct r_pgdata *);
static int fs_buildname(struct l9p_fid *, char *, char *, size_t);
static int fs_pdir(struct fs_softc *, struct l9p_fid *, char *, size_t,
    struct stat *st);
static int fs_dpf(char *, char *, size_t);
static int fs_oflags_dotu(int, int *);
static int fs_oflags_dotl(uint32_t, int *, enum l9p_omode *);
static int fs_nde(struct fs_softc *, struct l9p_fid *, bool, gid_t,
    struct stat *, uid_t *, gid_t *);
static struct fs_fid *open_fid(int, const char *, struct fs_authinfo *, bool);
static void dostat(struct fs_softc *, struct l9p_stat *, char *,
    struct stat *, bool dotu);
#ifdef __illumos__
/*
 * illumos variants: dostatfs() takes a struct statvfs here, and
 * getcrtime() exists only on this platform.
 */
static void getcrtime(struct fs_softc *, int, const char *, uint64_t *,
    uint64_t *);
static void dostatfs(struct l9p_statfs *, struct statvfs *, long);
/*
 * Declarations matching the names look_for_nfsv4_acl() uses.
 * NOTE(review): acl_get_fd_np() is only declared here; presumably a
 * compatibility implementation is provided elsewhere -- confirm.
 */
#define ACL_TYPE_NFS4 1
acl_t *acl_get_fd_np(int fd, int type);
#else
/* Non-illumos variant: dostatfs() takes a struct statfs. */
static void dostatfs(struct l9p_statfs *, struct statfs *, long);
#endif
static void fillacl(struct fs_fid *ff);
static struct l9p_acl *getacl(struct fs_fid *ff, int fd, const char *path);
static void dropacl(struct fs_fid *ff);
static struct l9p_acl *look_for_nfsv4_acl(struct fs_fid *ff, int fd,
    const char *path);
static int check_access(int32_t,
    struct l9p_acl *, struct stat *, struct l9p_acl *, struct stat *,
    struct fs_authinfo *, gid_t);
static void generate_qid(struct stat *, struct l9p_qid *);

static int fs_icreate(void *, struct l9p_fid *, char *, int,
    bool, mode_t, gid_t, struct stat *);
static int fs_iopen(void *, struct l9p_fid *, int, enum l9p_omode,
    gid_t, struct stat *);
static int fs_imkdir(void *, struct l9p_fid *, char *,
    bool, mode_t, gid_t, struct stat *);
static int fs_imkfifo(void *, struct l9p_fid *, char *,
    bool, mode_t, gid_t, struct stat *);
static int fs_imknod(void *, struct l9p_fid *, char *,
    bool, mode_t, dev_t, gid_t, struct stat *);
static int fs_imksocket(void *, struct l9p_fid *, char *,
    bool, mode_t, gid_t, struct stat *);
static int fs_isymlink(void *, struct l9p_fid *, char *, char *,
    gid_t, struct stat *);

/*
 * Internal functions implementing backend.
 */
static int fs_attach(void *, struct l9p_request *);
static int fs_clunk(void *, struct l9p_fid *);
static int fs_create(void *, struct l9p_request *);
static int fs_open(void *, struct l9p_request *);
static int fs_read(void *, struct l9p_request *);
static int fs_remove(void *, struct l9p_fid *);
static int fs_stat(void *, struct l9p_request *);
static int fs_walk(void *, struct l9p_request *);
static int fs_write(void *, struct l9p_request *);
static int fs_wstat(void *, struct l9p_request *);
static int fs_statfs(void *, struct l9p_request *);
static int fs_lopen(void *, struct l9p_request *);
static int fs_lcreate(void *, struct l9p_request *);
static int fs_symlink(void *, struct l9p_request *);
static int fs_mknod(void *, struct l9p_request *);
static int fs_rename(void *, struct l9p_request *);
static int fs_readlink(void *, struct l9p_request *);
static int fs_getattr(void *, struct l9p_request *);
static int fs_setattr(void *, struct l9p_request *);
static int fs_xattrwalk(void *, struct l9p_request *);
static int fs_xattrcreate(void *, struct l9p_request *);
static int fs_readdir(void *, struct l9p_request *);
static int fs_fsync(void *, struct l9p_request *);
static int fs_lock(void *, struct l9p_request *);
static int fs_getlock(void *, struct l9p_request *);
static int fs_link(void *, struct l9p_request *);
static int fs_renameat(void *, struct l9p_request *);
static int fs_unlinkat(void *, struct l9p_request *);
static void fs_freefid(void *, struct l9p_fid *);

/*
 * Convert from 9p2000 open/create mode to Unix-style O_* flags.
 * This includes 9p2000.u extensions, but not 9p2000.L protocol,
 * which has entirely different open, create, etc., flag bits.
 *
 * The <mode> given here is the one-byte (uint8_t) "mode"
 * argument to Tcreate or Topen, so it can have at most 8 bits.
 *
 * https://swtch.com/plan9port/man/man9/open.html and
 * http://plan9.bell-labs.com/magic/man2html/5/open
 * both say:
 *
 *   The [low two bits of the] mode field determines the
 *   type of I/O ... [I]f mode has the OTRUNC (0x10) bit
 *   set, the file is to be truncated, which requires write
 *   permission ...; if the mode has the ORCLOSE (0x40) bit
 *   set, the file is to be removed when the fid is clunked,
 *   which requires permission to remove the file from its
 *   directory.  All other bits in mode should be zero.  It
 *   is illegal to write a directory, truncate it, or
 *   attempt to remove it on close.
 *
 * 9P2000.u may add ODIRECT (0x80); this is not completely clear.
 * The fcall.h header defines OCEXEC (0x20) as well, but it makes
 * no sense to send this to a server.  There seem to be no bits
 * 0x04 and 0x08.
 *
 * We always turn on O_NOCTTY since as a server, we never want
 * to gain a controlling terminal.  We always turn on O_NOFOLLOW
 * for reasons described elsewhere.
 */
static int
fs_oflags_dotu(int mode, int *aflags)
{
        int oflags;
        int leftover;

        /* Translate the two-bit access mode first. */
        switch (mode & L9P_OACCMODE) {

        case L9P_OWRITE:
                oflags = O_WRONLY;
                break;

        case L9P_ORDWR:
                oflags = O_RDWR;
                break;

        case L9P_OEXEC:
                /* Exec access opens read-only and may not truncate. */
                if ((mode & L9P_OTRUNC) != 0)
                        return (EINVAL);
                oflags = O_RDONLY;
                break;

        case L9P_OREAD:
        default:
                oflags = O_RDONLY;
                break;
        }

        /* As a server: never gain a tty, never follow a final symlink. */
        oflags |= O_NOCTTY | O_NOFOLLOW;

        if ((mode & L9P_OTRUNC) != 0)
                oflags |= O_TRUNC;

        /*
         * Everything we understand is now accounted for: the truncate
         * bit and access mode were translated above, ORCLOSE is the
         * caller's business, and OCEXEC / ODIRECT mean nothing here.
         * Whatever remains is a bit we cannot translate; reject it so
         * that problems are noticed.
         */
        leftover = mode & ~(L9P_OTRUNC | L9P_OACCMODE | L9P_ORCLOSE |
            L9P_OCEXEC | L9P_ODIRECT);
        if (leftover != 0) {
                L9P_LOG(L9P_INFO,
                    "fs_oflags_dotu: untranslated bits: %#x",
                    (unsigned)leftover);
                return (EINVAL);
        }

        *aflags = oflags;
        return (0);
}

/*
 * Convert from 9P2000.L (Linux) open mode bits to O_* flags.
 * See fs_oflags_dotu above.
 *
 * Linux currently does not have open-for-exec, but there is a
 * proposal for it using O_PATH|O_NOFOLLOW, now handled here.
 *
 * We may eventually also set L9P_ORCLOSE for L_O_TMPFILE.
 */
static int
fs_oflags_dotl(uint32_t l_mode, int *aflags, enum l9p_omode *ap9)
{
        /*
         * Access-mode mask in the Linux (wire) encoding.  This must
         * not be the host's O_ACCMODE: l_mode carries Linux bit
         * values, and a host whose O_ACCMODE covers more than the low
         * two bits would swallow unrelated protocol bits (such as
         * L9P_L_O_PATH) before they are examined below.
         */
        const uint32_t l_accmode = 3;
        int flags;
        enum l9p_omode p9;
#define CLEAR(theirs)   l_mode &= ~(uint32_t)(theirs)
#define CONVERT(theirs, ours) \
        do { \
                if (l_mode & (theirs)) { \
                        CLEAR(theirs); \
                        flags |= ours; \
                } \
        } while (0)

        /*
         * Linux O_RDONLY, O_WRONLY, O_RDWR (0,1,2) match BSD/MacOS,
         * so the low two bits can be used directly as host flags.
         * The value 3 is not a valid access mode.
         */
        flags = (int)(l_mode & l_accmode);
        if (flags == 3)
                return (EINVAL);
        CLEAR(l_accmode);

        /* O_PATH together with O_NOFOLLOW is treated as open-for-exec. */
        if ((l_mode & (L9P_L_O_PATH | L9P_L_O_NOFOLLOW)) ==
                    (L9P_L_O_PATH | L9P_L_O_NOFOLLOW)) {
                CLEAR(L9P_L_O_PATH | L9P_L_O_NOFOLLOW);
                p9 = L9P_OEXEC;
        } else {
                /*
                 * Slightly dirty, but same dirt, really, as
                 * setting flags from the access-mode bits.
                 */
                p9 = (enum l9p_omode)flags;
        }

        /* turn L_O_TMPFILE into L9P_ORCLOSE in *p9? */
        if (l_mode & L9P_L_O_TRUNC)
                p9 |= L9P_OTRUNC;       /* but don't CLEAR yet */

        flags |= O_NOCTTY | O_NOFOLLOW;

        /*
         * L_O_CREAT seems to be noise, since we get separate open
         * and create.  But it is actually set sometimes.  We just
         * throw it out here; create ops must set it themselves and
         * open ops have no permissions bits and hence cannot create.
         *
         * L_O_EXCL does make sense on create ops, i.e., we can
         * take a create op with or without L_O_EXCL.  We pass that
         * through.
         */
        CLEAR(L9P_L_O_CREAT);
        CONVERT(L9P_L_O_EXCL, O_EXCL);
        CONVERT(L9P_L_O_TRUNC, O_TRUNC);
        CONVERT(L9P_L_O_DIRECTORY, O_DIRECTORY);
        CONVERT(L9P_L_O_APPEND, O_APPEND);
        CONVERT(L9P_L_O_NONBLOCK, O_NONBLOCK);

        /*
         * Discard these as useless noise at our (server) end.
         * (NOATIME might be useful but we can only set it on a
         * per-mount basis.)
         */
        CLEAR(L9P_L_O_CLOEXEC);
        CLEAR(L9P_L_O_DIRECT);
        CLEAR(L9P_L_O_DSYNC);
        CLEAR(L9P_L_O_FASYNC);
        CLEAR(L9P_L_O_LARGEFILE);
        CLEAR(L9P_L_O_NOATIME);
        CLEAR(L9P_L_O_NOCTTY);
        CLEAR(L9P_L_O_NOFOLLOW);
        CLEAR(L9P_L_O_SYNC);

        /* Anything still set is a bit we do not know how to translate. */
        if (l_mode != 0) {
                L9P_LOG(L9P_INFO,
                    "fs_oflags_dotl: untranslated bits: %#x",
                    (unsigned)l_mode);
                return (EINVAL);
        }

        *aflags = flags;
        *ap9 = p9;
        return (0);
#undef CLEAR
#undef CONVERT
}

/*
 * Look up the passwd entry for <uid>, storing lookup state in <pg>.
 * Casper builds go through the softc's passwd channel; other builds
 * call r_getpwuid() directly and do not use <sc>.
 */
static struct passwd *
fs_getpwuid(struct fs_softc *sc, uid_t uid, struct r_pgdata *pg)
{
        struct passwd *pw;

#if defined(WITH_CASPER)
        pw = r_cap_getpwuid(sc->fs_cappwd, uid, pg);
#else
        (void)sc;
        pw = r_getpwuid(uid, pg);
#endif
        return (pw);
}

/*
 * Look up the group entry for <gid>, storing lookup state in <pg>.
 * Casper builds go through the softc's group channel; other builds
 * call r_getgrgid() directly and do not use <sc>.
 */
static struct group *
fs_getgrgid(struct fs_softc *sc, gid_t gid, struct r_pgdata *pg)
{
        struct group *gr;

#if defined(WITH_CASPER)
        gr = r_cap_getgrgid(sc->fs_capgrp, gid, pg);
#else
        (void)sc;
        gr = r_getgrgid(gid, pg);
#endif
        return (gr);
}

/*
 * Build full name of file by appending given name to directory name.
 */
static int
fs_buildname(struct l9p_fid *dir, char *name, char *buf, size_t size)
{
        struct fs_fid *parent;
        const char *dname;
        size_t dirlen, namesz;
        char *out;

        parent = dir->lo_aux;
        assert(parent != NULL);

        dname = parent->ff_name;
        dirlen = strlen(dname);
        namesz = strlen(name) + 1;      /* includes the terminating '\0' */

        /* Need room for the directory, one '/', and name plus NUL. */
        if (dirlen + 1 + namesz > size)
                return (ENAMETOOLONG);

        out = buf;
        memcpy(out, dname, dirlen);
        out += dirlen;
        *out++ = '/';
        memcpy(out, name, namesz);
        return (0);
}

/*
 * Build parent name of file by splitting it off.  Return an error
 * if the given fid represents the root, so that there is no such
 * parent, or if the discovered parent is not a directory.
 */
static int
fs_pdir(struct fs_softc *sc __unused, struct l9p_fid *fid, char *buf,
    size_t size, struct stat *st)
{
        struct fs_fid *ff = fid->lo_aux;
        char *parent;

        assert(ff != NULL);

        /* r_dirname() yields NULL when the parent name cannot be built. */
        parent = r_dirname(ff->ff_name, buf, size);
        if (parent == NULL)
                return (ENAMETOOLONG);

        /* Stat the parent itself, without following a symlink. */
        if (fstatat(ff->ff_dirfd, parent, st, AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);

        return (S_ISDIR(st->st_mode) ? 0 : ENOTDIR);
}

/*
 * Like fs_buildname() but for adding a file name to a buffer
 * already holding a directory name.  Essentially does
 *     strcat(dbuf, "/");
 *     strcat(dbuf, fname);
 * but with size checking and an ENAMETOOLONG error as needed.
 *
 * (Think of the function name as "directory plus-equals file".)
 */
static int
fs_dpf(char *dbuf, char *fname, size_t size)
{
        size_t used = strlen(dbuf);
        size_t need = strlen(fname) + 1;        /* name plus its '\0' */
        char *tail;

        /* The result is <dbuf> '/' <fname> '\0'; it must fit in size. */
        if (used + 1 + need > size)
                return (ENAMETOOLONG);

        tail = dbuf + used;
        *tail = '/';
        memcpy(tail + 1, fname, need);
        return (0);
}

/*
 * Prepare to create a new directory entry (open with O_CREAT,
 * mkdir, etc -- any operation that creates a new inode),
 * operating in parent data <dir>, based on authinfo <ai> and
 * effective gid <egid>.
 *
 * The new entity should be owned by user/group <*nuid, *ngid>,
 * if it's really a new entity.  It will be a directory if isdir.
 *
 * Returns an error number if the entry should not be created
 * (e.g., read-only file system or no permission to write in
 * parent directory).  Always sets *nuid and *ngid on success:
 * in the worst case, when there is no available ID, this will
 * use the parent directory's IDs.  Fills in <*st> on success.
 */
static int
fs_nde(struct fs_softc *sc, struct l9p_fid *dir, bool isdir, gid_t egid,
    struct stat *st, uid_t *nuid, gid_t *ngid)
{
        struct fs_fid *parent;
        struct fs_authinfo *auth;
        int32_t want;

        /* Nothing may be created through a read-only export. */
        if (sc->fs_readonly)
                return (EROFS);

        parent = dir->lo_aux;
        assert(parent != NULL);

        /* The would-be parent must exist and be a directory. */
        if (fstatat(parent->ff_dirfd, parent->ff_name, st,
            AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);
        if (!S_ISDIR(st->st_mode))
                return (ENOTDIR);

        /* Check permission to add this kind of entry to the parent. */
        auth = parent->ff_ai;
        fillacl(parent);
        if (isdir)
                want = L9P_ACE_ADD_SUBDIRECTORY;
        else
                want = L9P_ACE_ADD_FILE;
        if (check_access(want, parent->ff_acl, st, NULL, NULL, auth,
            egid) != 0)
                return (EPERM);

        /* Owner: the attach uid if there is one, else the parent's. */
        if (auth->ai_uid != (uid_t)-1)
                *nuid = auth->ai_uid;
        else
                *nuid = st->st_uid;

        /*
         * Group: the supplied effective gid wins; otherwise the first
         * gid of the authinfo's set; otherwise the parent's gid.
         */
        if (egid != (gid_t)-1)
                *ngid = egid;
        else if (auth->ai_ngids > 0)
                *ngid = auth->ai_gids[0];
        else
                *ngid = st->st_gid;

        return (0);
}

/*
 * Allocate new open-file data structure to attach to a fid.
 *
 * The new file's authinfo is the same as the old one's, and
 * we gain a reference.
 */
static struct fs_fid *
open_fid(int dirfd, const char *path, struct fs_authinfo *ai, bool creating)
{
        struct fs_fid *ret;
        uint32_t newcount;
        int error;

        ret = l9p_calloc(1, sizeof(*ret));
#ifdef __illumos__
        error = pthread_mutex_init(&ret->ff_mtx, &fs_mutexattr);
#else
        error = pthread_mutex_init(&ret->ff_mtx, NULL);
#endif
        if (error) {
                free(ret);
                return (NULL);
        }
        ret->ff_fd = -1;
        ret->ff_dirfd = dirfd;
        ret->ff_name = strdup(path);
        if (ret->ff_name == NULL) {
                (void) pthread_mutex_destroy(&ret->ff_mtx);
                free(ret);
                return (NULL);
        }
        if (pthread_mutex_lock(&ai->ai_mtx) != 0) {
                (void) pthread_mutex_destroy(&ret->ff_mtx);
                free(ret->ff_name);
                free(ret);
                return (NULL);
        }
        newcount = ++ai->ai_refcnt;
        (void) pthread_mutex_unlock(&ai->ai_mtx);
        /*
         * If we just incremented the count to 1, we're the *first*
         * reference.  This is only allowed when creating the authinfo,
         * otherwise it means something has gone wrong.  This cannot
         * catch every bad (re)use of a freed authinfo but it may catch
         * a few.
         */
        assert(newcount > 1 || creating);
        L9P_LOG(L9P_DEBUG, "authinfo %p now used by %lu",
            (void *)ai, (u_long)newcount);
        ret->ff_ai = ai;
        return (ret);
}

/*
 * Fill in the 9P stat structure <s> for the file called <name>, using
 * the stat data the caller already collected in <buf>.
 *
 * <s> is zeroed first.  Without <dotu> the owner, group and modifier
 * are reported as strdup()ed names (NULL when the lookup or the copy
 * fails).  With <dotu> the numeric ids are reported instead, and
 * s->extension describes symlink targets and device numbers; it is
 * left NULL whenever it cannot be produced.
 */
static void
dostat(struct fs_softc *sc, struct l9p_stat *s, char *name,
    struct stat *buf, bool dotu)
{
        struct passwd *user;
        struct group *group;

        memset(s, 0, sizeof(struct l9p_stat));

        generate_qid(buf, &s->qid);

        s->type = 0;
        s->dev = 0;

        /* Permission bits, plus 9P mode bits describing the file type. */
        s->mode = buf->st_mode & 0777;

        if (S_ISDIR(buf->st_mode))
                s->mode |= L9P_DMDIR;

        if (S_ISLNK(buf->st_mode) && dotu)
                s->mode |= L9P_DMSYMLINK;

        if (S_ISCHR(buf->st_mode) || S_ISBLK(buf->st_mode))
                s->mode |= L9P_DMDEVICE;

        if (S_ISSOCK(buf->st_mode))
                s->mode |= L9P_DMSOCKET;

        if (S_ISFIFO(buf->st_mode))
                s->mode |= L9P_DMNAMEDPIPE;

        /* The 9P stat time fields are 32 bits wide. */
        s->atime = (uint32_t)buf->st_atime;
        s->mtime = (uint32_t)buf->st_mtime;
        s->length = (uint64_t)buf->st_size;

        s->name = r_basename(name, NULL, 0);

        if (!dotu) {
                struct r_pgdata udata, gdata;

                user = fs_getpwuid(sc, buf->st_uid, &udata);
                group = fs_getgrgid(sc, buf->st_gid, &gdata);
                s->uid = user != NULL ? strdup(user->pw_name) : NULL;
                s->gid = group != NULL ? strdup(group->gr_name) : NULL;
                s->muid = user != NULL ? strdup(user->pw_name) : NULL;
                r_pgfree(&udata);
                r_pgfree(&gdata);
        } else {
                /*
                 * When using 9P2000.u, we don't need to bother about
                 * providing user and group names in textual form.
                 */
                s->n_uid = buf->st_uid;
                s->n_gid = buf->st_gid;
                s->n_muid = buf->st_uid;

                if (S_ISLNK(buf->st_mode)) {
                        char target[MAXPATHLEN];
                        /*
                         * NOTE(review): plain readlink() resolves
                         * <name> against the current directory, not
                         * a directory fd; confirm this matches how
                         * callers build <name>.
                         */
                        ssize_t ret = readlink(name, target, MAXPATHLEN);

                        if (ret < 0) {
                                s->extension = NULL;
                                return;
                        }

                        s->extension = strndup(target, (size_t)ret);
                }

                /*
                 * asprintf() reports failure through its return value
                 * and need not leave a usable pointer behind, so put
                 * s->extension back to NULL ("unset") in that case.
                 */
                if (S_ISBLK(buf->st_mode)) {
                        if (asprintf(&s->extension, "b %d %d",
                            major(buf->st_rdev), minor(buf->st_rdev)) < 0)
                                s->extension = NULL;
                }

                if (S_ISCHR(buf->st_mode)) {
                        if (asprintf(&s->extension, "c %d %d",
                            major(buf->st_rdev), minor(buf->st_rdev)) < 0)
                                s->extension = NULL;
                }
        }
}

/*
 * Translate the platform's file system statistics <in> into the 9P
 * statfs reply <out>.  <namelen> is supplied by the caller because it
 * is not taken from <in>.
 */
#ifdef __illumos__
static void
dostatfs(struct l9p_statfs *out, struct statvfs *in, long namelen)
#else
static void
dostatfs(struct l9p_statfs *out, struct statfs *in, long namelen)
#endif
{

        /* Fields that are handled the same way on every platform. */
        out->type = L9P_FSTYPE;
        out->fsid = STATFS_FSID(in);
        out->namelen = (uint32_t)namelen;
        out->bsize = in->f_bsize;
        out->files = in->f_files;
        out->ffree = in->f_ffree;

#ifdef __illumos__
        /* Rescale block counts by f_frsize / f_bsize to match bsize. */
        out->blocks = in->f_blocks * in->f_frsize / in->f_bsize;
        out->bfree = in->f_bfree * in->f_frsize / in->f_bsize;
        out->bavail = in->f_bavail * in->f_frsize / in->f_bsize;
#else
        out->blocks = in->f_blocks;
        out->bfree = in->f_bfree;
        out->bavail = in->f_bavail;
#endif
}

/*
 * Build the 9P qid for a file from its stat data: the path is the
 * inode number, the version is always 0, and the type carries the
 * bit for a regular file, directory or symbolic link.
 *
 * Every field of <qid> is written, so the caller need not clear it
 * beforehand.
 */
static void
generate_qid(struct stat *buf, struct l9p_qid *qid)
{
        qid->path = buf->st_ino;
        qid->version = 0;

        /* Start clean so stale caller bits never leak into the type. */
        qid->type = 0;

        if (S_ISREG(buf->st_mode))
                qid->type |= L9P_QTFILE;

        if (S_ISDIR(buf->st_mode))
                qid->type |= L9P_QTDIR;

        if (S_ISLNK(buf->st_mode))
                qid->type |= L9P_QTSYMLINK;
}

/*
 * Fill in ff->ff_acl if it's not set yet.  Skip if the "don't use
 * ACLs" flag is set, and use the flag to remember failure so
 * we don't bother retrying either.
 */
static void
fillacl(struct fs_fid *ff)
{

        /* Already cached: nothing to do. */
        if (ff->ff_acl != NULL)
                return;

        /* ACL lookup is disabled, or failed on an earlier attempt. */
        if ((ff->ff_flags & FF_NO_NFSV4_ACL) != 0)
                return;

        ff->ff_acl = look_for_nfsv4_acl(ff, ff->ff_fd, ff->ff_name);

        /* Remember a failed lookup so it is not retried. */
        if (ff->ff_acl == NULL)
                ff->ff_flags |= FF_NO_NFSV4_ACL;
}

/*
 * Get an ACL given fd and/or path name.  We check for the "don't get
 * ACL" flag in the given ff_fid data structure first, but don't set
 * the flag here.  The fillacl() code is similar but will set the
 * flag; it also uses the ff_fd and ff_name directly.
 *
 * (This is used to get ACLs for parent directories, for instance.)
 */
static struct l9p_acl *
getacl(struct fs_fid *ff, int fd, const char *path)
{

        /* Honor the "no ACLs" flag, but never set it from here. */
        return ((ff->ff_flags & FF_NO_NFSV4_ACL) != 0 ?
            NULL : look_for_nfsv4_acl(ff, fd, path));
}

/*
 * Drop cached ff->ff_acl, e.g., after moving from one directory to
 * another, where inherited ACLs might change.
 */
static void
dropacl(struct fs_fid *ff)
{
        struct l9p_acl *stale = ff->ff_acl;

        /* Forget the cached ACL and go back to the authinfo's flags. */
        ff->ff_acl = NULL;
        ff->ff_flags = ff->ff_ai->ai_flags;
        l9p_acl_free(stale);
}

/*
 * Check to see if we can find NFSv4 ACLs for the given file.
 * If we have an open fd, we can use that, otherwise we need
 * to use the path.
 */
static struct l9p_acl *
look_for_nfsv4_acl(struct fs_fid *ff, int fd, const char *path)
{
        struct l9p_acl *acl;
#ifdef __illumos__
        acl_t *sysacl;
#else
        acl_t sysacl;
#endif
        bool doclose = false;

        /*
         * No descriptor supplied: open <path> relative to the fid's
         * directory fd just for the ACL lookup, and remember to close
         * it again before returning.
         */
        if (fd < 0) {
                fd = openat(ff->ff_dirfd, path, O_RDONLY);
                if (fd < 0) {
                        /*
                         * Report the real reason for the failure
                         * instead of handing an invalid descriptor
                         * to acl_get_fd_np() and close().
                         */
                        L9P_LOG(L9P_ERROR,
                            "error opening %s to retrieve "
                            "NFSv4 ACL: %s", path, strerror(errno));
                        return (NULL);
                }
                doclose = true;
        }

        sysacl = acl_get_fd_np(fd, ACL_TYPE_NFS4);
        if (sysacl == NULL) {
                /*
                 * EINVAL means no NFSv4 ACLs apply for this file.
                 * Other error numbers indicate some kind of problem.
                 */
                if (errno != EINVAL) {
                        L9P_LOG(L9P_ERROR,
                            "error retrieving NFSv4 ACL from "
                            "fdesc %d (%s): %s", fd,
                            path, strerror(errno));
                }

                if (doclose)
                        close(fd);

                return (NULL);
        }

        /* Convert the system ACL into the library's own form. */
#if defined(HAVE_FREEBSD_ACLS)
        acl = l9p_freebsd_nfsv4acl_to_acl(sysacl);
#elif defined(HAVE__ILLUMOS_ACLS)
        acl = l9p_illumos_nfsv4acl_to_acl(sysacl);
#else
        acl = NULL; /* XXX need a l9p_darwin_acl_to_acl */
#endif
        acl_free(sysacl);

        if (doclose)
                close(fd);

        return (acl);
}

/*
 * Verify that the user whose authinfo is in <ai> and effective
 * group ID is <egid> ((gid_t)-1 means no egid supplied) has
 * permission to do something.
 *
 * The "something" may be rather complex: we allow NFSv4 style
 * operation masks here, and provide parent and child ACLs and
 * stat data.  At most one of pacl+pst and cacl+cst can be NULL,
 * unless ACLs are not supported; then pacl and cacl can both
 * be NULL but pst or cst must be non-NULL depending on the
 * operation.
 */
static int
check_access(int32_t opmask,
    struct l9p_acl *pacl, struct stat *pst,
    struct l9p_acl *cacl, struct stat *cst,
    struct fs_authinfo *ai, gid_t egid)
{
        struct l9p_acl_check_args args;

        /*
         * If we have ACLs, use them exclusively, ignoring Unix
         * permissions.  Otherwise, fall back on stat st_mode
         * bits, and allow super-user as well.
         */
        args.aca_uid = ai->ai_uid;
        args.aca_gid = egid;
        args.aca_groups = ai->ai_gids;
        args.aca_ngroups = (size_t)ai->ai_ngids;
        args.aca_parent = pacl;
        args.aca_pstat = pst;
        args.aca_child = cacl;
        args.aca_cstat = cst;
        args.aca_aclmode = pacl == NULL && cacl == NULL
            ? L9P_ACM_STAT_MODE
            : L9P_ACM_NFS_ACL | L9P_ACM_ZFS_ACL;

        args.aca_superuser = true;
        return (l9p_acl_check_access(opmask, &args));
}

/*
 * Handle Tattach: work out which user is attaching, build the
 * authinfo (uid plus gid set) for that user, and hang a new fs_fid
 * for the export root off the request's fid.
 *
 * Returns 0 on success with the root qid in the Rattach response, or
 * an error number.  Failures to identify the user or to stat the root
 * are reported as EPERM, except ENOENT and ENOTDIR which are passed
 * through.
 */
static int
fs_attach(void *softc, struct l9p_request *req)
{
        struct fs_authinfo *ai;
        struct fs_softc *sc = (struct fs_softc *)softc;
        struct fs_fid *file;
        struct passwd *pwd;
        struct stat st;
        struct r_pgdata udata;
        bool have_udata = false;        /* udata was handed to fs_getpwuid */
        uint32_t n_uname;
        gid_t *gids;
        uid_t uid;
        int error;
        int ngroups;

        assert(req->lr_fid != NULL);

        /*
         * Single-thread pwd/group related items.  We have a reentrant
         * r_getpwuid but not a reentrant r_getpwnam, and l9p_getgrlist
         * may use non-reentrant C library getgr* routines.
         */
        if ((error = pthread_mutex_lock(&fs_attach_mutex)) != 0)
                return (error);

        n_uname = req->lr_req.tattach.n_uname;
        if (n_uname != L9P_NONUNAME) {
                uid = (uid_t)n_uname;
                pwd = fs_getpwuid(sc, uid, &udata);
                have_udata = true;
#if defined(L9P_DEBUG)
                if (pwd == NULL)
                        L9P_LOG(L9P_DEBUG,
                            "Tattach: uid %ld: no such user", (long)uid);
#endif
        } else {
                uid = (uid_t)-1;
#if defined(WITH_CASPER)
                pwd = cap_getpwnam(sc->fs_cappwd, req->lr_req.tattach.uname);
#else
                pwd = getpwnam(req->lr_req.tattach.uname);
#endif
#if defined(L9P_DEBUG)
                if (pwd == NULL)
                        L9P_LOG(L9P_DEBUG,
                            "Tattach: %s: no such user",
                            req->lr_req.tattach.uname);
#endif
        }

        /*
         * If caller didn't give a numeric UID, pick it up from pwd
         * if possible.  If that doesn't work we can't continue.
         *
         * Note that pwd also supplies the group set.  This assumes
         * the server has the right mapping; this needs improvement.
         * We do at least support ai->ai_ngids==0 properly now though.
         */
        if (uid == (uid_t)-1 && pwd != NULL)
                uid = pwd->pw_uid;
        if (uid == (uid_t)-1)
                error = EPERM;
        else {
                error = 0;
                if (fstat(sc->fs_rootfd, &st) != 0)
                        error = errno;
                else if (!S_ISDIR(st.st_mode))
                        error = ENOTDIR;
        }
        if (error) {
                /* Release the lookup data, as dostat() does. */
                if (have_udata)
                        r_pgfree(&udata);
                (void) pthread_mutex_unlock(&fs_attach_mutex);
                L9P_LOG(L9P_DEBUG,
                    "Tattach: denying uid=%ld access to rootdir: %s",
                    (long)uid, strerror(error));
                /*
                 * Pass ENOENT and ENOTDIR through for diagnosis;
                 * others become EPERM.  This should not leak too
                 * much security.
                 */
                return (error == ENOENT || error == ENOTDIR ? error : EPERM);
        }

        if (pwd != NULL) {
                /*
                 * This either succeeds and fills in ngroups and
                 * returns non-NULL, or fails and sets ngroups to 0
                 * and returns NULL.  Either way ngroups is correct.
                 */
                gids = l9p_getgrlist(pwd->pw_name, pwd->pw_gid, &ngroups);
        } else {
                gids = NULL;
                ngroups = 0;
        }

        /*
         * pwd is not used past this point, so the lookup data behind
         * it can be released now (previously it was never released).
         */
        if (have_udata)
                r_pgfree(&udata);

        /*
         * Done with pwd and group related items that may use
         * non-reentrant C library routines; allow other threads in.
         */
        (void) pthread_mutex_unlock(&fs_attach_mutex);

        /* The gid set lives in the authinfo's flexible array member. */
        ai = malloc(sizeof(*ai) + (size_t)ngroups * sizeof(gid_t));
        if (ai == NULL) {
                free(gids);
                return (ENOMEM);
        }
#ifdef __illumos__
        error = pthread_mutex_init(&ai->ai_mtx, &fs_mutexattr);
#else
        error = pthread_mutex_init(&ai->ai_mtx, NULL);
#endif
        if (error) {
                free(gids);
                free(ai);
                return (error);
        }
        ai->ai_refcnt = 0;
        ai->ai_uid = uid;
        ai->ai_flags = 0;       /* XXX for now */
        ai->ai_ngids = ngroups;
        /*
         * gids is NULL when there are no groups, and memcpy() must
         * not be given a null pointer even for a zero-length copy.
         */
        if (ngroups > 0)
                memcpy(ai->ai_gids, gids, (size_t)ngroups * sizeof(gid_t));
        free(gids);

        /* open_fid() takes the first reference on the new authinfo. */
        file = open_fid(sc->fs_rootfd, ".", ai, true);
        if (file == NULL) {
                (void) pthread_mutex_destroy(&ai->ai_mtx);
                free(ai);
                return (ENOMEM);
        }

        req->lr_fid->lo_aux = file;
        generate_qid(&st, &req->lr_resp.rattach.qid);
        return (0);
}

/*
 * Clunk: release whatever the fid currently holds open.
 *
 * If the fid has a directory stream, that stream is closed; otherwise,
 * if it has a plain descriptor (ff_fd != -1), that descriptor is closed.
 * The closed field is reset so it cannot be closed a second time.
 * Always returns 0.
 */
static int
fs_clunk(void *softc __unused, struct l9p_fid *fid)
{
        struct fs_fid *ff = fid->lo_aux;

        assert(ff != NULL);

        if (ff->ff_dir != NULL) {
                closedir(ff->ff_dir);
                ff->ff_dir = NULL;
                return (0);
        }

        if (ff->ff_fd != -1) {
                close(ff->ff_fd);
                ff->ff_fd = -1;
        }
        return (0);
}

/*
 * Create ops.
 *
 * We are to create a new file under some existing path,
 * where the new file's name is in the Tcreate request and the
 * existing path is due to a fid-based file (req->lr_fid).
 *
 * One op (create regular file) sets file->fd, the rest do not.
 */
static int
fs_create(void *softc, struct l9p_request *req)
{
        struct l9p_fid *dir;
        struct stat st;
        uint32_t dmperm;
        mode_t perm;
        char *name;
        int error;

        dir = req->lr_fid;
        name = req->lr_req.tcreate.name;
        dmperm = req->lr_req.tcreate.perm;
        perm = (mode_t)(dmperm & 0777);

        if (dmperm & L9P_DMDIR)
                error = fs_imkdir(softc, dir, name, true,
                    perm, (gid_t)-1, &st);
        else if (dmperm & L9P_DMSYMLINK)
                error = fs_isymlink(softc, dir, name,
                    req->lr_req.tcreate.extension, (gid_t)-1, &st);
        else if (dmperm & L9P_DMNAMEDPIPE)
                error = fs_imkfifo(softc, dir, name, true,
                    perm, (gid_t)-1, &st);
        else if (dmperm & L9P_DMSOCKET)
                error = fs_imksocket(softc, dir, name, true,
                    perm, (gid_t)-1, &st);
        else if (dmperm & L9P_DMDEVICE) {
                unsigned int major, minor;
                char type;
                dev_t dev;

                /*
                 * ??? Should this be testing < 3?  For now, allow a single
                 * integer mode with minor==0 implied.
                 */
                minor = 0;
                if (sscanf(req->lr_req.tcreate.extension, "%c %u %u",
                    &type, &major, &minor) < 2) {
                        return (EINVAL);
                }

                switch (type) {
                case 'b':
                        perm |= S_IFBLK;
                        break;
                case 'c':
                        perm |= S_IFCHR;
                        break;
                default:
                        return (EINVAL);
                }
                dev = makedev(major, minor);
                error = fs_imknod(softc, dir, name, true, perm, dev,
                    (gid_t)-1, &st);
        } else {
                enum l9p_omode p9;
                int flags;

                p9 = req->lr_req.tcreate.mode;
                error = fs_oflags_dotu(p9, &flags);
                if (error)
                        return (error);
                error = fs_icreate(softc, dir, name, flags,
                    true, perm, (gid_t)-1, &st);
                req->lr_resp.rcreate.iounit = req->lr_conn->lc_max_io_size;
        }

        if (error == 0)
                generate_qid(&st, &req->lr_resp.rcreate.qid);

        return (error);
}

/*
 * Plan 9 permission inheritance.
 *
 * https://swtch.com/plan9port/man/man9/open.html and
 * http://plan9.bell-labs.com/magic/man2html/5/open
 * specify that the permissions actually granted are
 *     perm & (~0666 | (dir.perm & 0666))
 * for files, and
 *     perm & (~0777 | (dir.perm & 0777))
 * for directories.  In other words the parent directory can only
 * remove permission bits that the request asked for; bits outside the
 * mask (0666 or 0777) pass through from <perm> untouched.
 *
 * This is arguably too restrictive; a control knob would be nice.
 */
static inline mode_t
fs_p9perm(mode_t perm, mode_t dir_perm, bool isdir)
{
        /* Bits that the parent directory is allowed to take away. */
        const int mask = isdir ? 0777 : 0666;

        return (perm & (~mask | (dir_perm & mask)));
}

/*
 * Internal form of create (plain file).
 *
 * Our caller takes care of splitting off all the special
 * types of create (mknod, etc), so this is purely for files.
 * We receive the fs_softc <softc>, the directory fid <dir>
 * in which the new file is to be created, the name of the
 * new file, a flag <isp9> indicating whether to do plan9 style
 * permissions or Linux style permissions, the permissions <perm>,
 * an effective group id <egid>, and a pointer to a stat structure
 * <st> to fill in describing the final result on success.
 *
 * On successful create, the fid switches to the newly created
 * file, which is now open; its associated file-name changes too.
 *
 * Note that the original (dir) fid is never currently open,
 * so there is nothing to close.
 */
static int
fs_icreate(void *softc, struct l9p_fid *dir, char *name, int flags,
    bool isp9, mode_t perm, gid_t egid, struct stat *st)
{
        struct fs_fid *file;
        gid_t gid;
        uid_t uid;
        char newname[MAXPATHLEN];
        int error, fd;

        file = dir->lo_aux;

        /*
         * Build full path name from directory + file name.  We'll
         * check permissions on the parent directory, then race to
         * create the file before anything bad happens like symlinks.
         *
         * (To close this race we need to use openat(), which is
         * left for a later version of this code.)
         */
        error = fs_buildname(dir, name, newname, sizeof(newname));
        if (error)
                return (error);

        /* In case of success, we will need a new file->ff_name. */
        name = strdup(newname);
        if (name == NULL)
                return (ENOMEM);

        /* Check create permission and compute new file ownership. */
        error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
        if (error) {
                free(name);
                return (error);
        }

        /* Adjust new-file permissions for Plan9 protocol. */
        if (isp9)
                perm = fs_p9perm(perm, st->st_mode, false);

        /* Create is always exclusive so O_TRUNC is irrelevant. */
        fd = openat(file->ff_dirfd, newname, flags | O_CREAT | O_EXCL, perm);
        if (fd < 0) {
                error = errno;
                free(name);
                return (error);
        }

        /* Fix permissions and owner. */
        if (fchmod(fd, perm) != 0 ||
            fchown(fd, uid, gid) != 0 ||
            fstat(fd, st) != 0) {
                error = errno;
                (void) close(fd);
                /* unlink(newname); ? */
                free(name);
                return (error);
        }

        /* It *was* a directory; now it's a file, and it's open. */
        free(file->ff_name);
        file->ff_name = name;
        file->ff_fd = fd;
        return (0);
}

/*
 * Internal form of open: stat file and verify permissions (from p9
 * argument), then open the file-or-directory, leaving the internal
 * fs_fid fields set up.  If we cannot open the file, return a
 * suitable error number, and leave everything unchanged.
 *
 * To mitigate the race between permissions testing and the actual
 * open, we stat the file twice: once by name with fstatat() and
 * AT_SYMLINK_NOFOLLOW before the open, then with fstat() on the
 * descriptor afterwards.  If st_dev or st_ino differ, somebody
 * substituted another object in between and the open is refused.
 * We assume O_NOFOLLOW is set in flags, so if some other race-winner
 * substitutes in a symlink we won't open it here.  (However, embedded
 * symlinks, if they occur, are still an issue.)
 *
 * Directories are opened with openat(O_DIRECTORY) and then wrapped in
 * a DIR stream by fdopendir(); O_NOFOLLOW is not passed there, so for
 * directories we rely purely on the substitution check above.
 *
 * A final-component symlink is always refused with EPERM.  Failures of
 * the open itself and of the re-check are also reported as EPERM.
 * On a read-only file system, any write access, O_TRUNC or
 * L9P_ORCLOSE is refused with EROFS before anything else is tried.
 *
 * On successful return, st has been filled in and either ff_dir (for
 * a directory) or ff_fd (anything else) has been set on the fid.
 */
static int
fs_iopen(void *softc, struct l9p_fid *fid, int flags, enum l9p_omode p9,
    gid_t egid __unused, struct stat *st)
{
        struct fs_softc *sc = softc;
        struct fs_fid *file;
        struct stat first;
        int32_t op;
        char *name;
        int error;
        int fd;
        DIR *dirp;

        /* Forbid write ops on read-only file system. */
        if (sc->fs_readonly) {
                if ((flags & O_TRUNC) != 0)
                        return (EROFS);
                if ((flags & O_ACCMODE) != O_RDONLY)
                        return (EROFS);
                if (p9 & L9P_ORCLOSE)
                        return (EROFS);
        }

        file = fid->lo_aux;
        assert(file != NULL);
        name = file->ff_name;

        if (fstatat(file->ff_dirfd, name, &first, AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);
        if (S_ISLNK(first.st_mode))
                return (EPERM);

        /* Can we rely on O_APPEND here?  Best not, can be cleared. */
        switch (flags & O_ACCMODE) {
        case O_RDONLY:
                op = L9P_ACE_READ_DATA;
                break;
        case O_WRONLY:
                op = L9P_ACE_WRITE_DATA;
                break;
        case O_RDWR:
                op = L9P_ACE_READ_DATA | L9P_ACE_WRITE_DATA;
                break;
        default:
                return (EINVAL);
        }
        fillacl(file);
        error = check_access(op, NULL, NULL, file->ff_acl, &first,
            file->ff_ai, (gid_t)-1);
        if (error)
                return (error);

        if (S_ISDIR(first.st_mode)) {
                /* Forbid write or truncate on directory. */
                if ((flags & O_ACCMODE) != O_RDONLY || (flags & O_TRUNC))
                        return (EPERM);
                fd = openat(file->ff_dirfd, name, O_DIRECTORY);
                if (fd < 0)
                        return (EPERM);
                dirp = fdopendir(fd);
                if (dirp == NULL) {
                        /*
                         * fdopendir() only takes ownership of the
                         * descriptor when it succeeds; on failure it
                         * is still ours and must be closed here or it
                         * leaks.
                         */
                        (void) close(fd);
                        return (EPERM);
                }
                fd = dirfd(dirp);
        } else {
                dirp = NULL;
                fd = openat(file->ff_dirfd, name, flags);
                if (fd < 0)
                        return (EPERM);
        }

        /*
         * We have a valid fd, and maybe non-null dirp.  Re-check
         * the file, and fail if st_dev or st_ino changed.
         */
        if (fstat(fd, st) != 0 ||
            first.st_dev != st->st_dev ||
            first.st_ino != st->st_ino) {
                /* closedir() also closes the underlying descriptor. */
                if (dirp != NULL)
                        (void) closedir(dirp);
                else
                        (void) close(fd);
                return (EPERM);
        }
        if (dirp != NULL)
                file->ff_dir = dirp;
        else
                file->ff_fd = fd;
        return (0);
}

/*
 * Internal form of mkdir (common code for all forms).
 * We receive the fs_softc <softc>, the directory fid <dir>
 * in which the new entry is to be created, the name of the
 * new entry, a flag <isp9> indicating whether to do plan9 style
 * permissions or Linux style permissions, the permissions <perm>,
 * an effective group id <egid>, and a pointer to a stat structure
 * <st> to fill in describing the final result on success.
 *
 * See also fs_icreate() above.
 */
static int
fs_imkdir(void *softc, struct l9p_fid *dir, char *name,
    bool isp9, mode_t perm, gid_t egid, struct stat *st)
{
        struct fs_fid *ff;
        gid_t gid;
        uid_t uid;
        char newname[MAXPATHLEN];
        int error, fd;

        ff = dir->lo_aux;
        error = fs_buildname(dir, name, newname, sizeof(newname));
        if (error)
                return (error);

        error = fs_nde(softc, dir, true, egid, st, &uid, &gid);
        if (error)
                return (error);

        if (isp9)
                perm = fs_p9perm(perm, st->st_mode, true);

        if (mkdirat(ff->ff_dirfd, newname, perm) != 0)
                return (errno);

        fd = openat(ff->ff_dirfd, newname,
            O_DIRECTORY | O_RDONLY | O_NOFOLLOW);
        if (fd < 0 ||
            fchown(fd, uid, gid) != 0 ||
            fchmod(fd, perm) != 0 ||
            fstat(fd, st) != 0) {
                error = errno;
                /* rmdir(newname) ? */
        }
        if (fd >= 0)
                (void) close(fd);

        return (error);
}

#ifdef __APPLE__
/*
 * Thin wrapper around the undocumented OS X SYS___pthread_fchdir
 * syscall.  It would be best to avoid it, but there doesn't seem to be
 * another safe way to implement mknodat.
 * Dear Apple, please implement mknodat before you remove this syscall.
 *
 * Callers in this file pass a directory descriptor before operating on
 * a relative path, and then pass -1 to stop using the thread-local
 * working directory.  Returns whatever syscall() returns.
 *
 * The deprecation warning for syscall() is silenced just around the
 * call.
 */
static int
fs_ifchdir_thread_local(int fd)
{
        int rc;

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
        rc = syscall(SYS___pthread_fchdir, fd);
#pragma clang diagnostic pop
        return (rc);
}
#endif

/*
 * Internal form of mknod (special device).
 *
 * The device type (S_IFBLK, S_IFCHR) is included in the <mode> parameter.
 *
 * Arguments: the fs_softc <softc>; the directory fid <dir> in which
 * the node is made; its <name>; <isp9>, selecting Plan 9 style
 * permission inheritance for the low 0777 bits (see fs_p9perm());
 * <mode> (file type plus permissions); the device number <dev>; an
 * effective group id <egid>; and <st>, filled in on success.
 *
 * Returns 0 on success or an errno value.  If the node was created
 * but a later fixup failed, the node is left in place.
 */
static int
fs_imknod(void *softc, struct l9p_fid *dir, char *name,
    bool isp9, mode_t mode, dev_t dev, gid_t egid, struct stat *st)
{
        struct fs_fid *ff;
        mode_t perm;
        gid_t gid;
        uid_t uid;
        char newname[MAXPATHLEN];
        int error;

        ff = dir->lo_aux;
        error = fs_buildname(dir, name, newname, sizeof(newname));
        if (error)
                return (error);

        /* Check create permission and compute ownership of the result. */
        error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
        if (error)
                return (error);

        if (isp9) {
                perm = fs_p9perm(mode & 0777, st->st_mode, false);
                mode = (mode & ~0777) | perm;
        } else {
                perm = mode & 0777;
        }

#ifdef __APPLE__
        /*
         * No mknodat() here: temporarily make the directory this
         * thread's working directory and use plain mknod().  Report a
         * failure to enter the directory as an errno value, like every
         * other failure in this function, rather than as -1.
         */
        if (fs_ifchdir_thread_local(ff->ff_dirfd) < 0)
                return (errno);
        if (mknod(newname, mode, dev) != 0)
                error = errno;
        /* Stop using the thread-local cwd */
        (void) fs_ifchdir_thread_local(-1);
        if (error != 0)
                return (error);
#else
        if (mknodat(ff->ff_dirfd, newname, mode, dev) != 0)
                return (errno);
#endif

        /* We cannot open the new name; race to use l* syscalls. */
        if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
            fchmodat(ff->ff_dirfd, newname, perm, 0) != 0 ||
            fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
                error = errno;
        else if ((st->st_mode & S_IFMT) != (mode & S_IFMT))
                error = EPERM;          /* ??? lost a race anyway */

        /* if (error) unlink(newname) ? */

        return (error);
}

/*
 * Internal form of mkfifo.
 */
static int
fs_imkfifo(void *softc, struct l9p_fid *dir, char *name,
    bool isp9, mode_t perm, gid_t egid, struct stat *st)
{
        struct fs_fid *ff;
        gid_t gid;
        uid_t uid;
        char newname[MAXPATHLEN];
        int error;

        ff = dir->lo_aux;
        error = fs_buildname(dir, name, newname, sizeof(newname));
        if (error)
                return (error);

        error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
        if (error)
                return (error);

        if (isp9)
                perm = fs_p9perm(perm, st->st_mode, false);

        if (mkfifo(newname, perm) != 0)
                return (errno);

        /* We cannot open the new name; race to use l* syscalls. */
        if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
            fchmodat(ff->ff_dirfd, newname, perm, 0) != 0 ||
            fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
                error = errno;
        else if (!S_ISFIFO(st->st_mode))
                error = EPERM;          /* ??? lost a race anyway */

        /* if (error) unlink(newname) ? */

        return (error);
}

/*
 * Internal form of mksocket.
 *
 * This is a bit different because of the horrible socket naming
 * system (bind() with sockaddr_un sun_path).
 *
 * Arguments: the fs_softc <softc>; the directory fid <dir> in which
 * the socket inode is made; its <name>; <isp9>, selecting Plan 9
 * style permission inheritance (see fs_p9perm()); the permissions
 * <perm>; an effective group id <egid>; and <st>, filled in on
 * success.
 *
 * A throw-away AF_UNIX socket is created and bound to the new name
 * purely to make the inode appear; the socket descriptor is closed
 * again before returning.  Returns 0 on success or an errno value.
 *
 * NOTE(review): the plain bind() path resolves <newname> against the
 * process's working directory, whereas the follow-up fchownat()/
 * fchmodat()/fstatat() calls resolve it against ff_dirfd -- confirm
 * these always coincide for this server.
 */
static int
fs_imksocket(void *softc, struct l9p_fid *dir, char *name,
    bool isp9, mode_t perm, gid_t egid, struct stat *st)
{
        struct fs_fid *ff;
        struct sockaddr_un un;
        char *path;
        char newname[MAXPATHLEN];
        size_t pathlen;
        gid_t gid;
        uid_t uid;
        int error = 0, s, fd, slen;

        ff = dir->lo_aux;
        error = fs_buildname(dir, name, newname, sizeof(newname));
        if (error)
                return (error);

        /* Check create permission and compute ownership of the result. */
        error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
        if (error)
                return (error);

        if (isp9)
                perm = fs_p9perm(perm, st->st_mode, false);

        s = socket(AF_UNIX, SOCK_STREAM, 0);
        if (s < 0)
                return (errno);

        path = newname;
        fd = -1;
#ifdef HAVE_BINDAT
        /* Try bindat() if needed. */
        if (strlen(path) >= sizeof(un.sun_path)) {
                fd = openat(ff->ff_dirfd, ff->ff_name,
                    O_RDONLY | O_DIRECTORY | O_NOFOLLOW);
                if (fd >= 0)
                        path = name;
        }
#endif

        /*
         * Can only create the socket if the path will fit.
         * Even if we are using bindat() there are limits
         * (the API for AF_UNIX sockets is ... not good).
         *
         * Note: in theory we can fill sun_path to the end
         * (omitting a terminating '\0') but in at least one
         * Unix-like system, this was known to behave oddly,
         * so we test for ">=" rather than just ">".
         */
        pathlen = strlen(path);
        if (pathlen >= sizeof(un.sun_path)) {
                error = ENAMETOOLONG;
                goto out;
        }

        /*
         * Fill in the address completely *before* computing its
         * length: SUN_LEN() runs strlen() over sun_path, so it must
         * not be evaluated while sun_path is still uninitialized.
         * The length check above guarantees the copy, including its
         * terminating '\0', fits.
         */
        memset(&un, 0, sizeof(un));
        un.sun_family = AF_UNIX;
        memcpy(un.sun_path, path, pathlen + 1);
#ifndef __illumos__
        slen = un.sun_len = sizeof(struct sockaddr_un);
#else
        slen = SUN_LEN(&un);
#endif

#ifdef HAVE_BINDAT
        if (fd >= 0) {
                if (bindat(fd, s, (struct sockaddr *)&un, slen) < 0)
                        error = errno;
                goto out;       /* done now, for good or ill */
        }
#endif

        if (bind(s, (struct sockaddr *)&un, slen) < 0)
                error = errno;
out:

        if (error == 0) {
                /*
                 * We believe we created the socket-inode.  Fix
                 * permissions etc.  Note that we cannot use
                 * fstat() on the socket descriptor: it succeeds,
                 * but we get bogus data!
                 */
                if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
                    fchmodat(ff->ff_dirfd, newname, perm, 0) != 0 ||
                    fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
                        error = errno;
                else if (!S_ISSOCK(st->st_mode))
                        error = EPERM;          /* ??? lost a race anyway */

                /* if (error) unlink(newname) ? */
        }

        /*
         * It's not clear which error should override, although
         * ideally we should never see either close() call fail.
         * In any case we do want to try to close both fd and s,
         * always.  Let's set error only if it is not already set,
         * so that all exit paths can use the same code.
         */
        if (fd >= 0 && close(fd) != 0)
                if (error == 0)
                        error = errno;
        if (close(s) != 0)
                if (error == 0)
                        error = errno;

        return (error);
}

/*
 * Internal form of symlink.
 *
 * Note that symlinks are presumed to carry no permission bits.
 * They do have owners, however (who may be charged for quotas).
 */
static int
fs_isymlink(void *softc, struct l9p_fid *dir, char *name,
    char *symtgt, gid_t egid, struct stat *st)
{
        struct fs_fid *ff;
        gid_t gid;
        uid_t uid;
        char newname[MAXPATHLEN];
        int error;

        ff = dir->lo_aux;
        error = fs_buildname(dir, name, newname, sizeof(newname));
        if (error)
                return (error);

        error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
        if (error)
                return (error);

        if (symlinkat(symtgt, ff->ff_dirfd, newname) != 0)
                return (errno);

        /* We cannot open the new name; race to use l* syscalls. */
        if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
            fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
                error = errno;
        else if (!S_ISLNK(st->st_mode))
                error = EPERM;          /* ??? lost a race anyway */

        /* if (error) unlink(newname) ? */

        return (error);
}

static int
fs_open(void *softc, struct l9p_request *req)
{
        struct l9p_fid *fid = req->lr_fid;
        struct stat st;
        enum l9p_omode p9;
        int error, flags;

        p9 = req->lr_req.topen.mode;
        error = fs_oflags_dotu(p9, &flags);
        if (error)
                return (error);

        error = fs_iopen(softc, fid, flags, p9, (gid_t)-1, &st);
        if (error)
                return (error);

        generate_qid(&st, &req->lr_resp.ropen.qid);
        req->lr_resp.ropen.iounit = req->lr_conn->lc_max_io_size;
        return (0);
}

/*
 * Helper for directory read: lstat-style lookup of <name> inside the
 * directory that <file> currently has open (file->ff_dir).
 *
 * This is fstatat() with AT_SYMLINK_NOFOLLOW against the directory
 * stream's own descriptor, so a symlink entry is described rather
 * than followed.  Returns whatever fstatat() returns.
 */
static inline int
fs_lstatat(struct fs_fid *file, char *name, struct stat *st)
{
        int dfd = dirfd(file->ff_dir);

        return (fstatat(dfd, name, st, AT_SYMLINK_NOFOLLOW));
}

/*
 * Tread: read from an open fid.
 *
 * If the fid has a directory stream open, directory entries are read
 * one at a time, stat-ed, converted with dostat() and packed into the
 * response with l9p_pack_stat() until either the directory is
 * exhausted or an entry no longer fits (in which case the stream is
 * rewound to that entry so the next read picks it up again).  Entries
 * that cannot be stat-ed are skipped.  The whole scan runs with the
 * fid's ff_mtx held.
 *
 * Otherwise the fid's descriptor is read at the requested offset into
 * the request's data iovecs (truncated to the requested count) and the
 * number of bytes read is stored in the response.
 *
 * Returns 0 on success or an errno value.
 */
static int
fs_read(void *softc, struct l9p_request *req)
{
        struct l9p_stat l9stat;
        struct fs_softc *sc;
        struct fs_fid *file;
        bool dotu = req->lr_conn->lc_version >= L9P_2000U;
        ssize_t ret;

        sc = softc;
        file = req->lr_fid->lo_aux;
        assert(file != NULL);

        if (file->ff_dir != NULL) {
                struct dirent *d;
                struct stat st;
                struct l9p_message msg;
                long o;
                int err;

                if ((err = pthread_mutex_lock(&file->ff_mtx)) != 0)
                        return (err);

                /*
                 * Must use telldir before readdir since seekdir
                 * takes cookie values.  Unfortunately this wastes
                 * a lot of time (and memory) building unneeded
                 * cookies that can only be flushed by closing
                 * the directory.
                 *
                 * NB: FreeBSD libc seekdir has SINGLEUSE defined,
                 * so in fact, we can discard the cookies by
                 * calling seekdir on them.  This clears up wasted
                 * memory at the cost of even more wasted time...
                 *
                 * XXX: readdir/telldir/seekdir not thread safe
                 */
                l9p_init_msg(&msg, req, L9P_PACK);
                for (;;) {
                        o = telldir(file->ff_dir);
                        d = readdir(file->ff_dir);
                        if (d == NULL)
                                break;
                        if (fs_lstatat(file, d->d_name, &st))
                                continue;
                        dostat(sc, &l9stat, d->d_name, &st, dotu);
                        if (l9p_pack_stat(&msg, req, &l9stat) != 0) {
                                /* Did not fit: back up to this entry. */
                                seekdir(file->ff_dir, o);
                                break;
                        }
#if defined(__FreeBSD__)
                        seekdir(file->ff_dir, o);
                        (void) readdir(file->ff_dir);
#endif
                }

                (void) pthread_mutex_unlock(&file->ff_mtx);
        } else {
                size_t niov = l9p_truncate_iov(req->lr_data_iov,
                    req->lr_data_niov, req->lr_req.io.count);

#if defined(__FreeBSD__) || defined(__illumos__)
                ret = preadv(file->ff_fd, req->lr_data_iov, niov,
                    req->lr_req.io.offset);
#else
                /* XXX: not thread safe, should really use aio_listio. */
                if (lseek(file->ff_fd, (off_t)req->lr_req.io.offset, SEEK_SET) < 0)
                        return (errno);

                /*
                 * Keep the result as a signed ssize_t: narrowing it
                 * to uint32_t first would turn a -1 failure into a
                 * large positive count and hide the error from the
                 * check below.
                 */
                ret = readv(file->ff_fd, req->lr_data_iov, (int)niov);
#endif

                if (ret < 0)
                        return (errno);

                req->lr_resp.io.count = (uint32_t)ret;
        }

        return (0);
}

/*
 * Tremove: unlink the file or directory that <fid> refers to.
 *
 * Refused with EROFS on a read-only file system.  Otherwise the
 * parent directory is looked up with fs_pdir() (which supplies its
 * name and stat data), the object itself is stat-ed without following
 * a final symlink, and check_access() is asked whether the unlink is
 * permitted given both the parent's and the object's ACL and stat
 * data.  Directories are removed with AT_REMOVEDIR.
 *
 * Returns 0 on success or an errno value.  A non-empty directory is
 * always reported as ENOTEMPTY, even on systems where unlinkat()
 * reports EEXIST for that case.
 */
static int
fs_remove(void *softc, struct l9p_fid *fid)
{
        struct fs_softc *sc = softc;
        struct l9p_acl *parent_acl;
        struct fs_fid *file;
        struct stat pst, cst;
        char dirname[MAXPATHLEN];
        int error;

        if (sc->fs_readonly)
                return (EROFS);

        error = fs_pdir(sc, fid, dirname, sizeof(dirname), &pst);
        if (error)
                return (error);

        file = fid->lo_aux;
        /*
         * Report the stat failure itself.  "error" is 0 at this point
         * (fs_pdir() succeeded), so returning it would claim that the
         * remove worked when nothing was removed.
         */
        if (fstatat(file->ff_dirfd, file->ff_name, &cst, AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);

        parent_acl = getacl(file, -1, dirname);
        fillacl(file);

        error = check_access(L9P_ACOP_UNLINK,
            parent_acl, &pst, file->ff_acl, &cst, file->ff_ai, (gid_t)-1);
        l9p_acl_free(parent_acl);
        if (error)
                return (error);

        if (unlinkat(file->ff_dirfd, file->ff_name,
            S_ISDIR(cst.st_mode) ? AT_REMOVEDIR : 0) != 0) {
                error = errno;
                if (error == EEXIST && S_ISDIR(cst.st_mode))
                        error = ENOTEMPTY;
        }

        return (error);
}

/*
 * Tstat: stat the fid's file by name (relative to its ff_dirfd,
 * without following a final symlink) and convert the result into the
 * 9P stat structure of the response via dostat().  The .u extensions
 * are requested from dostat() when the connection speaks 9P2000.u or
 * later.  Returns 0 on success or the errno from fstatat().
 */
static int
fs_stat(void *softc, struct l9p_request *req)
{
        struct fs_softc *sc = softc;
        struct fs_fid *ff = req->lr_fid->lo_aux;
        bool dotu = req->lr_conn->lc_version >= L9P_2000U;
        struct stat sb;

        assert(ff);

        if (fstatat(ff->ff_dirfd, ff->ff_name, &sb,
            AT_SYMLINK_NOFOLLOW) != 0) {
                return (errno);
        }

        dostat(sc, &req->lr_resp.rstat.stat, ff->ff_name, &sb, dotu);
        return (0);
}

/*
 * Twalk: walk zero or more name components starting from the fid's
 * file, and attach the result to req->lr_newfid.
 *
 * Each component requires the current path to be a directory that the
 * caller may search (L9P_ACE_EXECUTE).  Components containing '/' and
 * the component "." are rejected with EINVAL; ".." strips one level
 * off the current path.  A qid is recorded for every component walked
 * successfully.
 *
 * Throughout the loop <st> and <acl> describe the last successfully
 * walked path (<succ>).  <acl> is either the original fid's own
 * ff_acl (borrowed, never freed here) or an ACL obtained from
 * getacl() that this function owns until it is handed to the new fid
 * or freed.
 *
 * Returns 0 on success, with rwalk.nwqid set to the number of qids
 * recorded, or an errno value.
 */
static int
fs_walk(void *softc, struct l9p_request *req)
{
        struct l9p_acl *acl;
        struct fs_authinfo *ai;
        struct fs_fid *file = req->lr_fid->lo_aux;
        struct fs_fid *newfile;
        struct stat st;
        size_t clen, namelen, need;
        char *comp, *succ, *next, *swtmp;
        bool atroot;
        bool dotdot;
        int i, nwname;
        int error = 0;
        char namebufs[2][MAXPATHLEN];

        /*
         * https://swtch.com/plan9port/man/man9/walk.html:
         *
         *    It is legal for nwname to be zero, in which case newfid
         *    will represent the same file as fid and the walk will
         *    usually succeed; this is equivalent to walking to dot.
         * [Aside: it's not clear if we should test S_ISDIR here.]
         *    ...
         *    The name ".." ... represents the parent directory.
         *    The name "." ... is not used in the protocol.
         *    ... A walk of the name ".." in the root directory
         *    of the server is equivalent to a walk with no name
         *    elements.
         *
         * Note that req.twalk.nwname never exceeds L9P_MAX_WELEM,
         * so it is safe to convert to plain int.
         *
         * We are to return an error only if the first walk fails,
         * else stop at the end of the names or on the first error.
         * The final fid is based on the last name successfully
         * walked.
         *
         * Note that we *do* get Twalk requests with nwname==0 on files.
         *
         * Set up "successful name" buffer pointer with base fid name,
         * initially.  We'll swap each new success into it as we go.
         *
         * Invariant: atroot and stat data correspond to current
         * (succ) path.
         */
        succ = namebufs[0];
        next = namebufs[1];
        namelen = strlcpy(succ, file->ff_name, MAXPATHLEN);
        if (namelen >= MAXPATHLEN)
                return (ENAMETOOLONG);
        if (fstatat(file->ff_dirfd, succ, &st, AT_SYMLINK_NOFOLLOW) < 0)
                return (errno);
        ai = file->ff_ai;
        atroot = strlen(succ) == 0; /* XXX? */
        fillacl(file);
        acl = file->ff_acl;

        nwname = (int)req->lr_req.twalk.nwname;

        for (i = 0; i < nwname; i++) {
                /*
                 * Must have execute permission to search a directory.
                 * Then, look up each component in its directory-so-far.
                 * Check for ".." along the way, handling specially
                 * as needed.  Forbid "/" in name components.
                 *
                 */
                if (!S_ISDIR(st.st_mode)) {
                        error = ENOTDIR;
                        goto out;
                }
                error = check_access(L9P_ACE_EXECUTE,
                     NULL, NULL, acl, &st, ai, (gid_t)-1);
                if (error) {
                        L9P_LOG(L9P_DEBUG,
                            "Twalk: denying dir-walk on \"%s\" for uid %u",
                            succ, (unsigned)ai->ai_uid);
                        error = EPERM;
                        goto out;
                }
                comp = req->lr_req.twalk.wname[i];
                if (strchr(comp, '/') != NULL) {
                        error = EINVAL;
                        break;
                }

                clen = strlen(comp);
                dotdot = false;

                /*
                 * Build next pathname (into "next").  If "..",
                 * just strip one name component off the success
                 * name so far.  Since we know this name fits, the
                 * stripped down version also fits.  Otherwise,
                 * the name is the base name plus '/' plus the
                 * component name plus terminating '\0'; this may
                 * or may not fit.
                 */
                if (comp[0] == '.') {
                        if (clen == 1) {
                                error = EINVAL;
                                break;
                        }
                        if (comp[1] == '.' && clen == 2)
                                dotdot = true;
                }
                if (dotdot) {
                        /*
                         * It's not clear how ".." at root should
                         * be handled when i > 0.  Obeying the man
                         * page exactly, we reset i to 0 and stop,
                         * declaring terminal success.
                         *
                         * Otherwise, we just climbed up one level
                         * so adjust "atroot".
                         */
                        if (atroot) {
                                i = 0;
                                break;
                        }
                        (void) r_dirname(succ, next, MAXPATHLEN);
                        namelen = strlen(next);
                        atroot = strlen(next) == 0; /* XXX? */
                } else {
                        need = namelen + 1 + clen + 1;
                        if (need > MAXPATHLEN) {
                                error = ENAMETOOLONG;
                                break;
                        }
                        memcpy(next, succ, namelen);
                        next[namelen++] = '/';
                        memcpy(&next[namelen], comp, clen + 1);
                        namelen += clen;
                        /*
                         * Since name is never ".", we are necessarily
                         * descending below the root now.
                         */
                        atroot = false;
                }

                if (fstatat(file->ff_dirfd, next, &st, AT_SYMLINK_NOFOLLOW) < 0) {
                        error = ENOENT;
                        break;
                }

                /*
                 * Success: generate qid and swap this
                 * successful name into place.  Update acl.
                 *
                 * After the swap "succ" is the path just walked and
                 * "next" is the previous one, so the ACL must be
                 * fetched for "succ" to keep it in step with st.
                 */
                generate_qid(&st, &req->lr_resp.rwalk.wqid[i]);
                swtmp = succ;
                succ = next;
                next = swtmp;
                if (acl != NULL && acl != file->ff_acl)
                        l9p_acl_free(acl);
                acl = getacl(file, -1, succ);
        }

        /*
         * Fail only if we failed on the first name.
         * Otherwise we succeeded on something, and "succ"
         * points to the last successful name in namebufs[].
         */
        if (error) {
                if (i == 0)
                        goto out;
                error = 0;
        }

        newfile = open_fid(file->ff_dirfd, succ, ai, false);
        if (newfile == NULL) {
                error = ENOMEM;
                goto out;
        }
        if (req->lr_newfid == req->lr_fid) {
                /*
                 * Before overwriting fid->lo_aux, free the old value.
                 * Note that this doesn't free the l9p_fid data,
                 * just the fs_fid data.  (But it does ditch ff_acl.)
                 *
                 * An ACL picked up during the walk belongs to us and
                 * must be handed to the new fid now: once "file" is
                 * gone there is no later chance to transfer or free
                 * it.  The old fid's own ACL is simply dropped along
                 * with the old fid.
                 */
                if (acl != file->ff_acl)
                        newfile->ff_acl = acl;
                acl = NULL;
                fs_freefid(softc, req->lr_fid);
                file = NULL;
        }
        req->lr_newfid->lo_aux = newfile;
        if (file != NULL && acl != file->ff_acl) {
                newfile->ff_acl = acl;
                acl = NULL;
        }
        req->lr_resp.rwalk.nwqid = (uint16_t)i;
out:
        /* Free a walked ACL that was not handed to a new fid. */
        if (file != NULL && acl != file->ff_acl)
                l9p_acl_free(acl);
        return (error);
}

/*
 * Handle a write request: write the request's data iovecs to the
 * fid's open descriptor at the requested offset and record the number
 * of bytes written in the response.
 *
 * Returns 0 on success, EROFS if the export is read-only, or the
 * errno from the underlying seek/write call.
 */
static int
fs_write(void *softc, struct l9p_request *req)
{
        struct fs_softc *sc = softc;
        struct fs_fid *file = req->lr_fid->lo_aux;
        size_t niov;
        ssize_t nwritten;

        assert(file != NULL);

        if (sc->fs_readonly)
                return (EROFS);

        /* Limit the iovec array according to the requested byte count. */
        niov = l9p_truncate_iov(req->lr_data_iov, req->lr_data_niov,
            req->lr_req.io.count);

#if defined(__FreeBSD__) || defined(__illumos__)
        nwritten = pwritev(file->ff_fd, req->lr_data_iov, niov,
            req->lr_req.io.offset);
#else
        /* XXX: not thread safe, should really use aio_listio. */
        if (lseek(file->ff_fd, (off_t)req->lr_req.io.offset, SEEK_SET) < 0)
                return (errno);

        nwritten = writev(file->ff_fd, req->lr_data_iov, (int)niov);
#endif

        if (nwritten < 0)
                return (errno);

        req->lr_resp.io.count = (uint32_t)nwritten;
        return (0);
}

/*
 * Handle a wstat request on the file named by req->lr_fid.
 *
 * Fields of the supplied stat are applied in this order: length
 * (truncate), owner/group (only when the connection speaks 9P2000.u
 * or later), mode, and finally name (rename within the same
 * directory).  atime and mtime changes are accepted but ignored; an
 * attempt to change dev fails with EPERM, and changing the length of
 * an open directory fails with EINVAL.
 *
 * Returns 0 on success, EROFS on a read-only export, or an errno
 * value from the first step that failed.
 *
 * XXX: stat(9P) says that either all the changes in a wstat request
 * happen, or none of them does.  That atomicity is clearly missing
 * here: a failure part-way through leaves earlier changes in place.
 */
static int
fs_wstat(void *softc, struct l9p_request *req)
{
        struct fs_softc *sc = softc;
        struct l9p_stat *l9stat = &req->lr_req.twstat.stat;
        struct l9p_fid *fid;
        struct fs_fid *file;
        int error = 0;

        fid = req->lr_fid;
        file = fid->lo_aux;
        assert(file != NULL);

        if (sc->fs_readonly)
                return (EROFS);

        if (l9stat->atime != (uint32_t)~0) {
                /* XXX: not implemented, ignore */
        }

        if (l9stat->mtime != (uint32_t)~0) {
                /* XXX: not implemented, ignore */
        }

        if (l9stat->dev != (uint32_t)~0) {
                error = EPERM;
                goto out;
        }

        if (l9stat->length != (uint64_t)~0) {
                int fd;

                if (file->ff_dir != NULL) {
                        error = EINVAL;
                        goto out;
                }

                /*
                 * ff_name is interpreted relative to ff_dirfd (as in
                 * every other call below), so a plain truncate(2),
                 * which resolves against the process's working
                 * directory, would hit the wrong path.  Open relative
                 * to ff_dirfd and truncate through the descriptor
                 * instead.  O_NONBLOCK keeps the open from blocking
                 * should the name refer to a FIFO.
                 */
                fd = openat(file->ff_dirfd, file->ff_name,
                    O_WRONLY | O_NONBLOCK);
                if (fd < 0) {
                        error = errno;
                        goto out;
                }
                if (ftruncate(fd, (off_t)l9stat->length) != 0) {
                        error = errno;
                        (void) close(fd);
                        goto out;
                }
                (void) close(fd);
        }

        if (req->lr_conn->lc_version >= L9P_2000U) {
                if (fchownat(file->ff_dirfd, file->ff_name, l9stat->n_uid,
                    l9stat->n_gid, AT_SYMLINK_NOFOLLOW) != 0) {
                        error = errno;
                        goto out;
                }
        }

        if (l9stat->mode != (uint32_t)~0) {
                if (fchmodat(file->ff_dirfd, file->ff_name,
                    l9stat->mode & 0777, 0) != 0) {
                        error = errno;
                        goto out;
                }
        }

        if (strlen(l9stat->name) > 0) {
                struct l9p_acl *parent_acl;
                struct stat st;
                char *tmp;
                char newname[MAXPATHLEN];

                /*
                 * Rename-within-directory: it's not deleting anything,
                 * but we need write permission on the directory.  This
                 * should suffice.
                 */
                error = fs_pdir(softc, fid, newname, sizeof(newname), &st);
                if (error)
                        goto out;
                parent_acl = getacl(file, -1, newname);
                error = check_access(L9P_ACE_ADD_FILE,
                    parent_acl, &st, NULL, NULL, file->ff_ai, (gid_t)-1);
                l9p_acl_free(parent_acl);
                if (error)
                        goto out;

                /* Turn the parent path in newname into the new full name. */
                error = fs_dpf(newname, l9stat->name, sizeof(newname));
                if (error)
                        goto out;

                /* Allocate before renaming so a failure changes nothing. */
                tmp = strdup(newname);
                if (tmp == NULL) {
                        error = ENOMEM;
                        goto out;
                }
                if (renameat(file->ff_dirfd, file->ff_name, file->ff_dirfd,
                    tmp) != 0) {
                        error = errno;
                        free(tmp);
                        goto out;
                }
                /* Successful rename, update file->ff_name.  ACL can stay. */
                free(file->ff_name);
                file->ff_name = tmp;
        }
out:
        return (error);
}

/*
 * Handle a statfs request: report file system statistics for the file
 * system holding the file named by req->lr_fid.
 *
 * The caller must pass a "read data" access check on the file.  The
 * file is then opened briefly so that fstatvfs()/fstatfs() and
 * fpathconf() can be applied to it; the temporary descriptor is
 * closed on every path out of the function.
 *
 * Returns 0 on success or an errno value.
 */
static int
fs_statfs(void *softc __unused, struct l9p_request *req)
{
        struct fs_fid *file;
        struct stat st;
#ifdef __illumos__
        struct statvfs f;
#else
        struct statfs f;
#endif
        long name_max;
        int error;
        int fd;

        file = req->lr_fid->lo_aux;
        assert(file);

        if (fstatat(file->ff_dirfd, file->ff_name, &st,
            AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);

        /*
         * Not entirely clear what access to require; we'll go
         * for "read data".
         */
        fillacl(file);
        error = check_access(L9P_ACE_READ_DATA, NULL, NULL,
            file->ff_acl, &st, file->ff_ai, (gid_t)-1);
        if (error)
                return (error);

        fd = openat(file->ff_dirfd, file->ff_name, O_RDONLY);
        if (fd < 0)
                return (errno);

        /* Capture errno before close() gets a chance to disturb it. */
#ifdef __illumos__
        error = fstatvfs(fd, &f) != 0 ? errno : 0;
#else
        error = fstatfs(fd, &f) != 0 ? errno : 0;
#endif
        if (error != 0) {
                (void) close(fd);
                return (error);
        }

        name_max = fpathconf(fd, _PC_NAME_MAX);
        error = errno;
        (void) close(fd);

        if (name_max == -1)
                return (error);

        dostatfs(&req->lr_resp.rstatfs.statfs, &f, name_max);

        return (0);
}

static int
fs_lopen(void *softc, struct l9p_request *req)
{
        struct l9p_fid *fid = req->lr_fid;
        struct stat st;
        enum l9p_omode p9;
        gid_t gid;
        int error, flags;

        error = fs_oflags_dotl(req->lr_req.tlopen.flags, &flags, &p9);
        if (error)
                return (error);

        gid = req->lr_req.tlopen.gid;
        error = fs_iopen(softc, fid, flags, p9, gid, &st);
        if (error)
                return (error);

        generate_qid(&st, &req->lr_resp.rlopen.qid);
        req->lr_resp.rlopen.iounit = req->lr_conn->lc_max_io_size;
        return (0);
}

static int
fs_lcreate(void *softc, struct l9p_request *req)
{
        struct l9p_fid *dir;
        struct stat st;
        enum l9p_omode p9;
        char *name;
        mode_t perm;
        gid_t gid;
        int error, flags;

        dir = req->lr_fid;
        name = req->lr_req.tlcreate.name;

        error = fs_oflags_dotl(req->lr_req.tlcreate.flags, &flags, &p9);
        if (error)
                return (error);

        perm = (mode_t)req->lr_req.tlcreate.mode & 0777; /* ? set-id bits? */
        gid = req->lr_req.tlcreate.gid;
        error = fs_icreate(softc, dir, name, flags, false, perm, gid, &st);
        if (error == 0)
                generate_qid(&st, &req->lr_resp.rlcreate.qid);
        req->lr_resp.rlcreate.iounit = req->lr_conn->lc_max_io_size;
        return (error);
}

/*
 * Handle a symlink request: create a symbolic link with the requested
 * name and target in the directory given by req->lr_fid, and return
 * the new link's qid in the response.  Returns 0 or an errno value.
 */
static int
fs_symlink(void *softc, struct l9p_request *req)
{
        struct stat sb;
        gid_t gid = req->lr_req.tsymlink.gid;
        int rc;

        rc = fs_isymlink(softc, req->lr_fid, req->lr_req.tsymlink.name,
            req->lr_req.tsymlink.symtgt, gid, &sb);
        if (rc != 0)
                return (rc);

        generate_qid(&sb, &req->lr_resp.rsymlink.qid);
        return (0);
}

/*
 * Handle a mknod request: create a block/character device node, FIFO
 * or socket with the requested name in the directory given by
 * req->lr_fid, and return the new node's qid in the response.
 *
 * Only the file type and the low nine permission bits of the requested
 * mode are used.  Any other file type is rejected with EINVAL.
 * Returns 0 on success or an errno value.
 */
static int
fs_mknod(void *softc, struct l9p_request *req)
{
        struct l9p_fid *dir;
        struct stat st;
        uint32_t mode, major, minor;
        dev_t dev;
        gid_t gid;
        char *name;
        int error;

        dir = req->lr_fid;
        name = req->lr_req.tmknod.name;
        mode = req->lr_req.tmknod.mode;
        gid = req->lr_req.tmknod.gid;

        switch (mode & S_IFMT) {
        case S_IFBLK:
        case S_IFCHR:
                mode = (mode & S_IFMT) | (mode & 0777); /* ??? */
                major = req->lr_req.tmknod.major;
                /* The minor number comes from its own request field. */
                minor = req->lr_req.tmknod.minor;
                dev = makedev(major, minor);
                error = fs_imknod(softc, dir, name, false,
                    (mode_t)mode, dev, gid, &st);
                break;

        case S_IFIFO:
                error = fs_imkfifo(softc, dir, name, false,
                    (mode_t)(mode & 0777), gid, &st);
                break;

        case S_IFSOCK:
                error = fs_imksocket(softc, dir, name, false,
                    (mode_t)(mode & 0777), gid, &st);
                break;

        default:
                error = EINVAL;
                break;
        }
        if (error == 0)
                generate_qid(&st, &req->lr_resp.rmknod.qid);
        return (error);
}

/*
 * Handle a rename request.
 *
 * req->lr_fid names the file being renamed and req->lr_fid2 the
 * directory that is to hold it afterwards; the new final component is
 * req->lr_req.trename.name.  We locate the file's current parent
 * directory and, when the file is really changing directories, require
 * unlink permission there.  Add-file / add-subdirectory permission on
 * the new directory is always required.
 *
 * On success the fid's stored name is replaced by the new path and its
 * cached ACL is dropped.  Returns 0, EROFS on a read-only export, or
 * an errno value.
 */
static int
fs_rename(void *softc, struct l9p_request *req)
{
        struct fs_softc *sc = softc;
        struct l9p_fid *srcfid, *dstfid;
        struct fs_fid *src, *dstdir;
        struct fs_authinfo *ai;
        struct l9p_acl *srcparent_acl;
        struct stat src_st, srcparent_st, dstdir_st;
        char srcparent[MAXPATHLEN], target[MAXPATHLEN];
        char *target_copy;
        bool moving;
        int32_t addop;
        int rc;

        if (sc->fs_readonly)
                return (EROFS);

        srcfid = req->lr_fid;
        src = srcfid->lo_aux;
        assert(src != NULL);
        ai = src->ff_ai;

        /* Find the path and stat data of the file's current parent. */
        rc = fs_pdir(sc, srcfid, srcparent, sizeof(srcparent), &srcparent_st);
        if (rc != 0)
                return (rc);

        dstfid = req->lr_fid2;
        dstdir = dstfid->lo_aux;
        assert(dstdir != NULL);

        /*
         * If the new parent directory has the same path as the old
         * one, nothing is being removed from the old directory.
         */
        moving = (strcmp(srcparent, dstdir->ff_name) != 0);

        fillacl(src);
        fillacl(dstdir);

        if (fstatat(src->ff_dirfd, src->ff_name, &src_st,
            AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);

        /*
         * Moving out of the old directory counts as an unlink from
         * it, in terms of ACL access.
         */
        if (moving) {
                srcparent_acl = getacl(src, -1, srcparent);
                rc = check_access(L9P_ACOP_UNLINK, srcparent_acl,
                    &srcparent_st, src->ff_acl, &src_st, ai, (gid_t)-1);
                l9p_acl_free(srcparent_acl);
                if (rc != 0)
                        return (rc);
        }

        /*
         * Check that we may "create" a file or directory in the new
         * directory.  (Arguably only needed when reparenting, but the
         * non-reparenting case wants directory write permission, which
         * is the same add-file/add-subdir check, so do it always.)
         */
        if (fstatat(dstdir->ff_dirfd, dstdir->ff_name, &dstdir_st,
            AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);

        if (S_ISDIR(src_st.st_mode))
                addop = L9P_ACE_ADD_SUBDIRECTORY;
        else
                addop = L9P_ACE_ADD_FILE;
        rc = check_access(addop, dstdir->ff_acl, &dstdir_st, NULL, NULL,
            ai, (gid_t)-1);
        if (rc != 0)
                return (rc);

        /*
         * Permissions are fine; build the final name.  The directory
         * name cannot exceed MAXPATHLEN, but out of general paranoia
         * the copy is checked anyway.
         */
        if (strlcpy(target, dstdir->ff_name, sizeof(target)) >=
            sizeof(target))
                return (ENAMETOOLONG);
        rc = fs_dpf(target, req->lr_req.trename.name, sizeof(target));
        if (rc != 0)
                return (rc);

        /* Allocate the replacement name first so rename can't strand us. */
        target_copy = strdup(target);
        if (target_copy == NULL)
                return (ENOMEM);

        if (renameat(src->ff_dirfd, src->ff_name, src->ff_dirfd,
            target_copy) != 0) {
                rc = errno;
                free(target_copy);
                return (rc);
        }

        /* The file has been renamed but the old fid is not clunked. */
        free(src->ff_name);
        src->ff_name = target_copy;

        dropacl(src);
        return (0);
}

/*
 * Handle a readlink request: read the target of the symbolic link
 * named by req->lr_fid and store a heap-allocated copy of it in the
 * response.
 *
 * Returns 0 on success, the errno from readlinkat() on failure, or
 * ENOMEM if the target fills the MAXPATHLEN buffer (and so may have
 * been truncated) or the copy cannot be allocated.
 */
static int
fs_readlink(void *softc __unused, struct l9p_request *req)
{
        struct fs_fid *file = req->lr_fid->lo_aux;
        char target[MAXPATHLEN];
        char *copy;
        ssize_t len;

        assert(file);

        len = readlinkat(file->ff_dirfd, file->ff_name, target,
            sizeof(target));
        if (len < 0)
                return (errno);

        /* todo: allocate dynamically */
        if ((size_t)len >= sizeof(target))
                return (ENOMEM);

        /* readlinkat() does not NUL-terminate; strndup() does. */
        copy = strndup(target, (size_t)len);
        req->lr_resp.rreadlink.target = copy;
        if (copy == NULL)
                return (ENOMEM);

        return (0);
}

/*
 * Handle a 9P2000.L getattr request.
 *
 * The file named by req->lr_fid is stat'ed without following a final
 * symlink.  Each attribute selected in the request mask is copied
 * into the response and its bit is set in the response's valid mask.
 * The qid is always generated.  If the stat fails, valid is set to 0
 * and the errno value is returned; otherwise the result is 0.
 */
static int
fs_getattr(void *softc __unused, struct l9p_request *req)
{
        struct fs_fid *file = req->lr_fid->lo_aux;
        struct stat sb;
        uint64_t want;
        uint64_t have = 0;
        int rc;

        assert(file);

        if (fstatat(file->ff_dirfd, file->ff_name, &sb,
            AT_SYMLINK_NOFOLLOW) != 0) {
                rc = errno;
                req->lr_resp.rgetattr.valid = have;
                return (rc);
        }

        /* ?? Can we provide items not-requested? If so, can skip tests. */
        want = req->lr_req.tgetattr.request_mask;

        if ((want & L9PL_GETATTR_MODE) != 0) {
                /* It is not clear if we need any translations. */
                req->lr_resp.rgetattr.mode = sb.st_mode;
                have |= L9PL_GETATTR_MODE;
        }

        if ((want & L9PL_GETATTR_NLINK) != 0) {
                req->lr_resp.rgetattr.nlink = sb.st_nlink;
                have |= L9PL_GETATTR_NLINK;
        }

        if ((want & L9PL_GETATTR_UID) != 0) {
                /* provide st_uid, or file->ff_uid? */
                req->lr_resp.rgetattr.uid = sb.st_uid;
                have |= L9PL_GETATTR_UID;
        }

        if ((want & L9PL_GETATTR_GID) != 0) {
                /* provide st_gid, or file->ff_gid? */
                req->lr_resp.rgetattr.gid = sb.st_gid;
                have |= L9PL_GETATTR_GID;
        }

        if ((want & L9PL_GETATTR_RDEV) != 0) {
                /* It is not clear if we need any translations. */
                req->lr_resp.rgetattr.rdev = (uint64_t)sb.st_rdev;
                have |= L9PL_GETATTR_RDEV;
        }

        if ((want & L9PL_GETATTR_ATIME) != 0) {
                req->lr_resp.rgetattr.atime_sec =
                    (uint64_t)STAT_ATIME(&sb).tv_sec;
                req->lr_resp.rgetattr.atime_nsec =
                    (uint64_t)STAT_ATIME(&sb).tv_nsec;
                have |= L9PL_GETATTR_ATIME;
        }

        if ((want & L9PL_GETATTR_MTIME) != 0) {
                req->lr_resp.rgetattr.mtime_sec =
                    (uint64_t)STAT_MTIME(&sb).tv_sec;
                req->lr_resp.rgetattr.mtime_nsec =
                    (uint64_t)STAT_MTIME(&sb).tv_nsec;
                have |= L9PL_GETATTR_MTIME;
        }

        if ((want & L9PL_GETATTR_CTIME) != 0) {
                req->lr_resp.rgetattr.ctime_sec =
                    (uint64_t)STAT_CTIME(&sb).tv_sec;
                req->lr_resp.rgetattr.ctime_nsec =
                    (uint64_t)STAT_CTIME(&sb).tv_nsec;
                have |= L9PL_GETATTR_CTIME;
        }

        if ((want & L9PL_GETATTR_BTIME) != 0) {
                /* Birth time source depends on the platform. */
#if defined(HAVE_BIRTHTIME)
                req->lr_resp.rgetattr.btime_sec =
                    (uint64_t)sb.st_birthtim.tv_sec;
                req->lr_resp.rgetattr.btime_nsec =
                    (uint64_t)sb.st_birthtim.tv_nsec;
#elif defined(__illumos__)
                getcrtime(softc, file->ff_dirfd, file->ff_name,
                    &req->lr_resp.rgetattr.btime_sec,
                    &req->lr_resp.rgetattr.btime_nsec);
#else
                req->lr_resp.rgetattr.btime_sec = 0;
                req->lr_resp.rgetattr.btime_nsec = 0;
#endif
                have |= L9PL_GETATTR_BTIME;
        }

        /* No separate inode field is written here; only the bit is set. */
        if ((want & L9PL_GETATTR_INO) != 0)
                have |= L9PL_GETATTR_INO;

        if ((want & L9PL_GETATTR_SIZE) != 0) {
                req->lr_resp.rgetattr.size = (uint64_t)sb.st_size;
                have |= L9PL_GETATTR_SIZE;
        }

        if ((want & L9PL_GETATTR_BLOCKS) != 0) {
                req->lr_resp.rgetattr.blksize = (uint64_t)sb.st_blksize;
                req->lr_resp.rgetattr.blocks = (uint64_t)sb.st_blocks;
                have |= L9PL_GETATTR_BLOCKS;
        }

#ifndef __illumos__
        if ((want & L9PL_GETATTR_GEN) != 0) {
                req->lr_resp.rgetattr.gen = sb.st_gen;
                have |= L9PL_GETATTR_GEN;
        }
#endif
        /* don't know what to do with data version yet */

        generate_qid(&sb, &req->lr_resp.rgetattr.qid);
        req->lr_resp.rgetattr.valid = have;
        return (0);
}

/*
 * Handle a 9P2000.L setattr request on the file named by req->lr_fid.
 *
 * The bits in req->lr_req.tsetattr.valid select which attributes are
 * changed.  They are applied in this order: mode, owner/group, size,
 * then access/modification times.  A time selected without its
 * corresponding *_SET bit is set to the current time; a time that is
 * not selected keeps the value read by the initial stat.
 *
 * Returns 0 on success, EROFS on a read-only export, EISDIR when
 * asked to resize a directory, or an errno value from the first step
 * that failed.  As with wstat there are atomicity issues: a failure
 * part-way through leaves earlier changes in place.
 *
 * Should combine some of this with wstat code.
 */
static int
fs_setattr(void *softc, struct l9p_request *req)
{
        uint64_t mask;
        struct fs_softc *sc = softc;
        struct timespec ts[2];
        struct fs_fid *file;
        struct stat st;
        int error = 0;
        uid_t uid;
        gid_t gid;

        file = req->lr_fid->lo_aux;
        assert(file);

        if (sc->fs_readonly)
                return (EROFS);

        mask = req->lr_req.tsetattr.valid;

        if (fstatat(file->ff_dirfd, file->ff_name, &st, AT_SYMLINK_NOFOLLOW)) {
                error = errno;
                goto out;
        }

        /* Reject a directory resize before changing anything else. */
        if ((mask & L9PL_SETATTR_SIZE) && S_ISDIR(st.st_mode)) {
                error = EISDIR;
                goto out;
        }

        if (mask & L9PL_SETATTR_MODE) {
                if (fchmodat(file->ff_dirfd, file->ff_name,
                    req->lr_req.tsetattr.mode & 0777,
                    0)) {
                        error = errno;
                        goto out;
                }
        }

        if (mask & (L9PL_SETATTR_UID | L9PL_SETATTR_GID)) {
                /* An id of -1 tells fchownat() to leave that id alone. */
                uid = mask & L9PL_SETATTR_UID
                    ? req->lr_req.tsetattr.uid
                    : (uid_t)-1;

                gid = mask & L9PL_SETATTR_GID
                    ? req->lr_req.tsetattr.gid
                    : (gid_t)-1;

                if (fchownat(file->ff_dirfd, file->ff_name, uid, gid,
                    AT_SYMLINK_NOFOLLOW)) {
                        error = errno;
                        goto out;
                }
        }

        if (mask & L9PL_SETATTR_SIZE) {
                /* Truncate follows symlinks, is this OK? */
                int fd = openat(file->ff_dirfd, file->ff_name, O_RDWR);

                /*
                 * Report the real reason the open failed rather than
                 * the EBADF that ftruncate(-1) would produce.
                 */
                if (fd < 0) {
                        error = errno;
                        goto out;
                }
                if (ftruncate(fd, (off_t)req->lr_req.tsetattr.size)) {
                        error = errno;
                        (void) close(fd);
                        goto out;
                }
                (void) close(fd);
        }

        if (mask & (L9PL_SETATTR_ATIME | L9PL_SETATTR_MTIME)) {
                /* Start from the current times so unselected ones persist. */
                ts[0].tv_sec = STAT_ATIME(&st).tv_sec;
                ts[0].tv_nsec = STAT_ATIME(&st).tv_nsec;
                ts[1].tv_sec = STAT_MTIME(&st).tv_sec;
                ts[1].tv_nsec = STAT_MTIME(&st).tv_nsec;

                if (mask & L9PL_SETATTR_ATIME) {
                        if (mask & L9PL_SETATTR_ATIME_SET) {
                                ts[0].tv_sec = req->lr_req.tsetattr.atime_sec;
                                ts[0].tv_nsec = req->lr_req.tsetattr.atime_nsec;
                        } else {
                                if (clock_gettime(CLOCK_REALTIME, &ts[0]) != 0) {
                                        error = errno;
                                        goto out;
                                }
                        }
                }

                if (mask & L9PL_SETATTR_MTIME) {
                        if (mask & L9PL_SETATTR_MTIME_SET) {
                                ts[1].tv_sec = req->lr_req.tsetattr.mtime_sec;
                                ts[1].tv_nsec = req->lr_req.tsetattr.mtime_nsec;
                        } else {
                                if (clock_gettime(CLOCK_REALTIME, &ts[1]) != 0) {
                                        error = errno;
                                        goto out;
                                }
                        }
                }

                if (utimensat(file->ff_dirfd, file->ff_name, ts,
                    AT_SYMLINK_NOFOLLOW)) {
                        error = errno;
                        goto out;
                }
        }
out:
        return (error);
}

/*
 * Backend entry point for xattrwalk requests.  Extended attributes are
 * not implemented by this backend: both arguments are ignored and the
 * call always fails with EOPNOTSUPP.
 */
static int
fs_xattrwalk(void *softc __unused, struct l9p_request *req __unused)
{
        return (EOPNOTSUPP);
}

/*
 * Backend entry point for xattrcreate requests.  Extended attributes
 * are not implemented by this backend: both arguments are ignored and
 * the call always fails with EOPNOTSUPP.
 */
static int
fs_xattrcreate(void *softc __unused, struct l9p_request *req __unused)
{
        return (EOPNOTSUPP);
}

/*
 * Handle a 9P2000.L readdir request: pack as many directory entries as
 * fit into the response, starting at the requested offset.
 *
 * The fid must refer to an open directory (ENOTDIR otherwise).  The
 * directory stream is positioned and read while holding the fid's
 * mutex.  Entries that cannot be lstat'ed are skipped.  The response
 * count is the message size after the last entry that was packed
 * completely.  Returns 0, ENOTDIR, or the error from locking the
 * mutex.
 */
static int
fs_readdir(void *softc __unused, struct l9p_request *req)
{
        struct fs_fid *file = req->lr_fid->lo_aux;
        struct l9p_message msg;
        struct l9p_dirent ent;
        struct dirent *dp;
        struct stat sb;
        uint32_t packed;
        int rc;

        assert(file);

        if (file->ff_dir == NULL)
                return (ENOTDIR);

        rc = pthread_mutex_lock(&file->ff_mtx);
        if (rc != 0)
                return (rc);

        /*
         * It's not clear whether we can use the same trick for
         * discarding offsets here as we do in fs_read.  It
         * probably should work, we'll have to see if some
         * client(s) use the zero-offset thing to rescan without
         * clunking the directory first.
         *
         * Probably the thing to do is switch to calling
         * getdirentries() / getdents() directly, instead of
         * going through libc.
         */
        if (req->lr_req.io.offset != 0)
                seekdir(file->ff_dir, (long)req->lr_req.io.offset);
        else
                rewinddir(file->ff_dir);

        l9p_init_msg(&msg, req, L9P_PACK);
        packed = (uint32_t)msg.lm_size; /* in case we get no entries */

        for (;;) {
                dp = readdir(file->ff_dir);
                if (dp == NULL)
                        break;

                /*
                 * Although "." is forbidden in naming and ".." is
                 * special cased, testing shows that we must transmit
                 * them through readdir.  (For ".." at root, we
                 * should perhaps alter the inode number, but not
                 * yet.)
                 *
                 * TODO: we do a full lstat here; could use dp->d_*
                 * to construct the qid more efficiently, as long
                 * as dp->d_type != DT_UNKNOWN.
                 */
                if (fs_lstatat(file, dp->d_name, &sb) != 0)
                        continue;

                ent.qid.type = 0;
                generate_qid(&sb, &ent.qid);
                /* The stream position after this entry. */
                ent.offset = (uint64_t)telldir(file->ff_dir);
#ifdef __illumos__
                ent.type = sb.st_mode & S_IFMT;
#else
                ent.type = dp->d_type;
#endif
                ent.name = dp->d_name;

                /* Update count only if we completely pack the dirent. */
                if (l9p_pudirent(&msg, &ent) < 0)
                        break;
                packed = (uint32_t)msg.lm_size;
        }

        (void) pthread_mutex_unlock(&file->ff_mtx);
        req->lr_resp.io.count = packed;
        return (0);
}

/*
 * Handle an fsync request: flush the fid's open directory (via the
 * descriptor behind its directory stream) if it has one, otherwise
 * its file descriptor.  Returns 0 or the errno from fsync().
 */
static int
fs_fsync(void *softc __unused, struct l9p_request *req)
{
        struct fs_fid *file = req->lr_fid->lo_aux;
        int fd;

        assert(file);

        if (file->ff_dir != NULL)
                fd = dirfd(file->ff_dir);
        else
                fd = file->ff_fd;

        if (fsync(fd) != 0)
                return (errno);
        return (0);
}

/*
 * Handle a lock request.  No lock is actually taken: any request
 * whose type is a read lock, write lock or unlock is answered with
 * L9PL_LOCK_SUCCESS, and any other type is rejected with EINVAL.
 */
static int
fs_lock(void *softc __unused, struct l9p_request *req)
{

        if (req->lr_req.tlock.type != L9PL_LOCK_TYPE_RDLOCK &&
            req->lr_req.tlock.type != L9PL_LOCK_TYPE_WRLOCK &&
            req->lr_req.tlock.type != L9PL_LOCK_TYPE_UNLOCK)
                return (EINVAL);

        req->lr_resp.rlock.status = L9PL_LOCK_SUCCESS;
        return (0);
}

/*
 * Handle a getlock request.
 *
 * Client wants to see if a request to lock a region would block.
 * This is, of course, not atomic anyway, so the op is useless.  QEMU
 * simply says "unlocked!", so we do too: the request is echoed back
 * with its type changed to "unlock" and an empty client id.
 *
 * Returns 0 on success, EINVAL for an unknown lock type, or ENOMEM if
 * the client id string cannot be allocated (in which case the
 * response is left untouched).
 */
static int
fs_getlock(void *softc __unused, struct l9p_request *req)
{
        char *client_id;

        switch (req->lr_req.getlock.type) {
        case L9PL_LOCK_TYPE_RDLOCK:
        case L9PL_LOCK_TYPE_WRLOCK:
        case L9PL_LOCK_TYPE_UNLOCK:
                break;
        default:
                return (EINVAL);
        }

        /*
         * Allocate before touching the response so that a failure
         * never leaves a NULL client_id behind for later code.
         */
        client_id = strdup("");  /* XXX what should go here? */
        if (client_id == NULL)
                return (ENOMEM);

        req->lr_resp.getlock = req->lr_req.getlock;
        req->lr_resp.getlock.type = L9PL_LOCK_TYPE_UNLOCK;
        req->lr_resp.getlock.client_id = client_id;
        return (0);
}

static int
fs_link(void *softc __unused, struct l9p_request *req)
{
        struct l9p_fid *dir;
        struct fs_fid *file;
        struct fs_fid *dirf;
        struct stat fst, tdst;
        int32_t op;
        char *name;
        char newname[MAXPATHLEN];
        int error;

        /* N.B.: lr_fid is the file to link, lr_fid2 is the target dir */
        dir = req->lr_fid2;
        dirf = dir->lo_aux;
        assert(dirf != NULL);

        name = req->lr_req.tlink.name;
        error = fs_buildname(dir, name, newname, sizeof(newname));
        if (error)
                return (error);

        file = req->lr_fid->lo_aux;
        assert(file != NULL);

        if (fstatat(dirf->ff_dirfd, dirf->ff_name, &tdst, AT_SYMLINK_NOFOLLOW) != 0 ||
            fstatat(file->ff_dirfd, file->ff_name, &fst, AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);
        if (S_ISDIR(fst.st_mode))
                return (EISDIR);
        fillacl(dirf);
        op = S_ISDIR(fst.st_mode) ? L9P_ACE_ADD_SUBDIRECTORY : L9P_ACE_ADD_FILE;
        error = check_access(op,
            dirf->ff_acl, &tdst, NULL, NULL, file->ff_ai, (gid_t)-1);
        if (error)
                return (error);

        if (linkat(file->ff_dirfd, file->ff_name, file->ff_dirfd,
            newname, 0) != 0)
                error = errno;
        else
                dropacl(file);

        return (error);
}

/*
 * Handle a mkdir request: create a directory with the requested name,
 * mode and group in the directory given by req->lr_fid, and return the
 * new directory's qid in the response.  Returns 0 or an errno value.
 */
static int
fs_mkdir(void *softc, struct l9p_request *req)
{
        struct stat sb;
        mode_t mode = (mode_t)req->lr_req.tmkdir.mode;
        gid_t gid = req->lr_req.tmkdir.gid;
        int rc;

        rc = fs_imkdir(softc, req->lr_fid, req->lr_req.tmkdir.name, false,
            mode, gid, &sb);
        if (rc != 0)
                return (rc);

        generate_qid(&sb, &req->lr_resp.rmkdir.qid);
        return (0);
}

/*
 * Handle a renameat request: rename "oldname" in the directory given
 * by req->lr_fid to "newname" in the directory given by req->lr_fid2.
 *
 * The old directory must really be a directory.  When the two
 * directories differ (different fids with different paths), the new
 * one must be a directory too, the caller needs unlink permission on
 * the entry in the old directory, and add-file / add-subdirectory
 * permission in the new one.
 *
 * Returns 0, EROFS on a read-only export, or an errno value.
 */
static int
fs_renameat(void *softc, struct l9p_request *req)
{
        struct fs_softc *sc = softc;
        struct l9p_fid *srcdirfid, *dstdirfid;
        struct fs_fid *srcdir, *dstdir;
        struct l9p_acl *entry_acl;
        struct stat srcdir_st, dstdir_st, entry_st;
        char srcpath[MAXPATHLEN], dstpath[MAXPATHLEN];
        bool moving;
        int32_t addop;
        int rc;

        if (sc->fs_readonly)
                return (EROFS);

        srcdirfid = req->lr_fid;
        dstdirfid = req->lr_fid2;
        assert(srcdirfid != NULL && dstdirfid != NULL);
        srcdir = srcdirfid->lo_aux;
        dstdir = dstdirfid->lo_aux;
        assert(srcdir != NULL && dstdir != NULL);

        /* Build both full names before touching the file system. */
        rc = fs_buildname(srcdirfid, req->lr_req.trenameat.oldname,
            srcpath, sizeof(srcpath));
        if (rc != 0)
                return (rc);
        rc = fs_buildname(dstdirfid, req->lr_req.trenameat.newname,
            dstpath, sizeof(dstpath));
        if (rc != 0)
                return (rc);

        /* The entry being renamed must exist. */
        if (fstatat(srcdir->ff_dirfd, srcpath, &entry_st,
            AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);

        moving = false;
        if (srcdirfid != dstdirfid &&
            strcmp(srcdir->ff_name, dstdir->ff_name) != 0)
                moving = true;

        if (fstatat(srcdir->ff_dirfd, srcdir->ff_name, &srcdir_st,
            AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);
        if (!S_ISDIR(srcdir_st.st_mode))
                return (ENOTDIR);
        fillacl(srcdir);

        if (moving) {
                if (fstatat(dstdir->ff_dirfd, dstdir->ff_name, &dstdir_st,
                    AT_SYMLINK_NOFOLLOW) != 0)
                        return (errno);
                if (!S_ISDIR(dstdir_st.st_mode))
                        return (ENOTDIR);
                entry_acl = getacl(srcdir, -1, srcpath);
                fillacl(dstdir);

                /* Leaving the old directory is an unlink from it. */
                rc = check_access(L9P_ACOP_UNLINK, srcdir->ff_acl,
                    &srcdir_st, entry_acl, &entry_st, srcdir->ff_ai,
                    (gid_t)-1);
                l9p_acl_free(entry_acl);
                if (rc != 0)
                        return (rc);

                /* Entering the new directory is a create in it. */
                if (S_ISDIR(entry_st.st_mode))
                        addop = L9P_ACE_ADD_SUBDIRECTORY;
                else
                        addop = L9P_ACE_ADD_FILE;
                rc = check_access(addop, dstdir->ff_acl, &dstdir_st,
                    NULL, NULL, dstdir->ff_ai, (gid_t)-1);
                if (rc != 0)
                        return (rc);
        }

        if (renameat(srcdir->ff_dirfd, srcpath, dstdir->ff_dirfd,
            dstpath) != 0)
                return (errno);

        return (0);
}

/*
 * Handle an unlinkat request: remove the named entry from the
 * directory given by req->lr_fid.  When the request flags include
 * L9PL_AT_REMOVEDIR the entry is removed as a directory, otherwise as
 * a file.
 *
 * The caller needs unlink permission, judged from the directory and
 * the entry.  An EEXIST failure while removing a directory is
 * reported as ENOTEMPTY.  Returns 0, EROFS on a read-only export, or
 * an errno value.
 */
static int
fs_unlinkat(void *softc, struct l9p_request *req)
{
        struct fs_softc *sc = softc;
        struct l9p_fid *dir = req->lr_fid;
        struct fs_fid *dirff;
        struct l9p_acl *entry_acl;
        struct stat dir_st, entry_st;
        char path[MAXPATHLEN];
        bool rmdir_wanted;
        int rc;

        if (sc->fs_readonly)
                return (EROFS);

        dirff = dir->lo_aux;
        assert(dirff != NULL);

        rc = fs_buildname(dir, req->lr_req.tunlinkat.name, path,
            sizeof(path));
        if (rc != 0)
                return (rc);

        /* Stat the entry first, then the directory holding it. */
        if (fstatat(dirff->ff_dirfd, path, &entry_st,
            AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);
        if (fstatat(dirff->ff_dirfd, dirff->ff_name, &dir_st,
            AT_SYMLINK_NOFOLLOW) != 0)
                return (errno);

        fillacl(dirff);
        entry_acl = getacl(dirff, -1, path);
        rc = check_access(L9P_ACOP_UNLINK, dirff->ff_acl, &dir_st,
            entry_acl, &entry_st, dirff->ff_ai, (gid_t)-1);
        l9p_acl_free(entry_acl);
        if (rc != 0)
                return (rc);

        rmdir_wanted = (req->lr_req.tunlinkat.flags & L9PL_AT_REMOVEDIR) != 0;
        if (unlinkat(dirff->ff_dirfd, path,
            rmdir_wanted ? AT_REMOVEDIR : 0) != 0) {
                rc = errno;
                /* Report a non-empty directory as ENOTEMPTY. */
                if (rmdir_wanted && rc == EEXIST)
                        rc = ENOTEMPTY;
                return (rc);
        }

        return (0);
}

/*
 * Release the backend state (struct fs_fid) hanging off a fid.
 *
 * Closes the fid's file descriptor and directory stream if open,
 * destroys its mutex, frees its name and ACL, and then drops its
 * reference on the shared authinfo, freeing the authinfo as well if
 * that was the last reference.  Does nothing if the fid has no
 * backend state.  fid->lo_aux itself is not modified here.
 */
static void
fs_freefid(void *softc __unused, struct l9p_fid *fid)
{
        struct fs_fid *ff = fid->lo_aux;
        struct fs_authinfo *ai;
        uint32_t remaining;

        /* Nothing to do here */
        if (ff == NULL)
                return;

        /* Remember the authinfo; ff is about to go away. */
        ai = ff->ff_ai;

        if (ff->ff_fd != -1)
                close(ff->ff_fd);

        if (ff->ff_dir)
                closedir(ff->ff_dir);

        (void) pthread_mutex_destroy(&ff->ff_mtx);
        free(ff->ff_name);
        l9p_acl_free(ff->ff_acl);
        free(ff);

        (void) pthread_mutex_lock(&ai->ai_mtx);
        remaining = --ai->ai_refcnt;
        (void) pthread_mutex_unlock(&ai->ai_mtx);

        if (remaining != 0) {
                L9P_LOG(L9P_DEBUG, "authinfo %p now used by %lu",
                    (void *)ai, (u_long)remaining);
                return;
        }

        /*
         * We *were* the last ref, no one can have gained a ref.
         */
        L9P_LOG(L9P_DEBUG, "dropped last ref to authinfo %p",
            (void *)ai);
        (void) pthread_mutex_destroy(&ai->ai_mtx);
        free(ai);
}

/*
 * Create a filesystem backend rooted at the directory open on rootfd.
 *
 * backendp: on success, receives the newly allocated backend, with all
 *           operation hooks pointing at the fs_* implementations in this
 *           file and softc pointing at a fresh fs_softc.
 * rootfd:   descriptor stored in the softc as the export root.
 * ro:       stored in the softc as the read-only flag.
 *
 * The first call also initializes the global attach mutex (as an
 * error-checking mutex on illumos).
 *
 * Returns 0 on success.  Returns -1 if the attach mutex cannot be
 * initialized (errno is set to the pthread error code) or, in
 * WITH_CASPER builds, if the Casper services cannot be opened.  On
 * failure *backendp is left untouched and nothing allocated here is
 * leaked.
 */
int
l9p_backend_fs_init(struct l9p_backend **backendp, int rootfd, bool ro)
{
        struct l9p_backend *backend;
        struct fs_softc *sc;
        int error;
#if defined(WITH_CASPER)
        cap_channel_t *capcas = NULL;
#endif

        if (!fs_attach_mutex_inited) {
#ifdef __illumos__
                if ((error = pthread_mutexattr_init(&fs_mutexattr)) != 0) {
                        errno = error;
                        return (-1);
                }
                if ((error = pthread_mutexattr_settype(&fs_mutexattr,
                    PTHREAD_MUTEX_ERRORCHECK)) != 0) {
                        errno = error;
                        return (-1);
                }
                error = pthread_mutex_init(&fs_attach_mutex, &fs_mutexattr);
#else
                error = pthread_mutex_init(&fs_attach_mutex, NULL);
#endif
                if (error) {
                        errno = error;
                        return (-1);
                }
                fs_attach_mutex_inited = true;
        }

        backend = l9p_malloc(sizeof(*backend));
        backend->attach = fs_attach;
        backend->clunk = fs_clunk;
        backend->create = fs_create;
        backend->open = fs_open;
        backend->read = fs_read;
        backend->remove = fs_remove;
        backend->stat = fs_stat;
        backend->walk = fs_walk;
        backend->write = fs_write;
        backend->wstat = fs_wstat;
        backend->statfs = fs_statfs;
        backend->lopen = fs_lopen;
        backend->lcreate = fs_lcreate;
        backend->symlink = fs_symlink;
        backend->mknod = fs_mknod;
        backend->rename = fs_rename;
        backend->readlink = fs_readlink;
        backend->getattr = fs_getattr;
        backend->setattr = fs_setattr;
        backend->xattrwalk = fs_xattrwalk;
        backend->xattrcreate = fs_xattrcreate;
        backend->readdir = fs_readdir;
        backend->fsync = fs_fsync;
        backend->lock = fs_lock;
        backend->getlock = fs_getlock;
        backend->link = fs_link;
        backend->mkdir = fs_mkdir;
        backend->renameat = fs_renameat;
        backend->unlinkat = fs_unlinkat;
        backend->freefid = fs_freefid;

        sc = l9p_malloc(sizeof(*sc));
        sc->fs_rootfd = rootfd;
        sc->fs_readonly = ro;
        backend->softc = sc;

#if defined(__illumos__)
        /*
         * Always assign the flag: l9p_malloc() is not known to zero the
         * allocation, and getcrtime() reads fs_hasxattr unconditionally.
         */
        sc->fs_hasxattr =
            (fpathconf(rootfd, _PC_XATTR_ENABLED) > 0) ? 1 : 0;
#endif

#if defined(WITH_CASPER)
        /* Known state for the failure path below. */
        sc->fs_cappwd = NULL;
        sc->fs_capgrp = NULL;

        capcas = cap_init();
        if (capcas == NULL)
                goto casper_fail;

        sc->fs_cappwd = cap_service_open(capcas, "system.pwd");
        if (sc->fs_cappwd == NULL)
                goto casper_fail;

        sc->fs_capgrp = cap_service_open(capcas, "system.grp");
        if (sc->fs_capgrp == NULL)
                goto casper_fail;

        cap_setpassent(sc->fs_cappwd, 1);
        cap_setgroupent(sc->fs_capgrp, 1);
        cap_close(capcas);
#elif defined(__illumos__)
        setpwent();
#else
        setpassent(1);
#endif

        *backendp = backend;
        return (0);

#if defined(WITH_CASPER)
casper_fail:
        /*
         * Release everything acquired so far, preserving the errno of the
         * call that failed.  free() is used for the l9p_malloc() memory,
         * matching how fs_freefid() releases this file's allocations
         * (NOTE(review): confirm l9p_malloc() is malloc-backed).
         */
        error = errno;
        if (sc->fs_cappwd != NULL)
                cap_close(sc->fs_cappwd);
        if (capcas != NULL)
                cap_close(capcas);
        free(sc);
        free(backend);
        errno = error;
        return (-1);
#endif
}

#ifdef __illumos__
/*
 * illumos implementation of acl_get_fd_np() on top of facl_get().
 *
 * fd:   open descriptor whose ACL is wanted.
 * type: when ACL_TYPE_NFS4, facl_get() is called with ACL_NO_TRIVIAL;
 *       for any other type no flags are passed.
 *
 * Returns the ACL produced by facl_get(), or NULL if facl_get() fails.
 */
acl_t *
acl_get_fd_np(int fd, int type)
{
        acl_t *result = NULL;
        int flags = (type == ACL_TYPE_NFS4) ? ACL_NO_TRIVIAL : 0;

        if (facl_get(fd, flags, &result) != 0)
                return (NULL);

        return (result);
}

/*
 * Look up the "crtime" system attribute of fname (relative to dirfd) and
 * report it through *secp / *nsp.
 *
 * Both outputs are set to 0 first and stay 0 when the softc reports no
 * extended-attribute support, when getattrat() fails, or when the
 * attribute list has no "crtime" uint64 array of exactly two elements.
 * Otherwise element 0 is stored in *secp and element 1 in *nsp.
 */
static void
getcrtime(struct fs_softc *sc, int dirfd, const char *fname, uint64_t *secp,
    uint64_t *nsp)
{
        nvlist_t *attrs = NULL;
        uint64_t *crtime = NULL;
        uint_t count = 0;

        *secp = 0;
        *nsp = 0;

        if (!sc->fs_hasxattr)
                return;

        if (getattrat(dirfd, XATTR_VIEW_READWRITE, fname, &attrs) != 0)
                return;

        /* Only a well-formed two-element array is accepted. */
        if (nvlist_lookup_uint64_array(attrs, "crtime", &crtime,
            &count) == 0 && count == 2) {
                *secp = crtime[0];
                *nsp = crtime[1];
        }

        nvlist_free(attrs);
}
#endif