/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Copyright 2015, Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/fbuf.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/dirent.h>
#include <sys/modctl.h>
#include <sys/statvfs.h>
#include <sys/mount.h>
#include <sys/sunddi.h>
#include <sys/bootconf.h>
#include <sys/policy.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <sys/swap.h>

#include <fs/fs_subr.h>

#include <sys/fs/udf_volume.h>
#include <sys/fs/udf_inode.h>

static int32_t udf_open(struct vnode **,
        int32_t, struct cred *, caller_context_t *);
static int32_t udf_close(struct vnode *,
        int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
static int32_t udf_read(struct vnode *,
        struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_write(struct vnode *,
        struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_ioctl(struct vnode *,
        int32_t, intptr_t, int32_t, struct cred *, int32_t *,
        caller_context_t *);
static int32_t udf_getattr(struct vnode *,
        struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_setattr(struct vnode *,
        struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
        int32_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_lookup(struct vnode *,
        char *, struct vnode **, struct pathname *,
        int32_t, struct vnode *, struct cred *,
        caller_context_t *, int *, pathname_t *);
static int32_t udf_create(struct vnode *,
        char *, struct vattr *, enum vcexcl,
        int32_t, struct vnode **, struct cred *, int32_t,
        caller_context_t *, vsecattr_t *);
static int32_t udf_remove(struct vnode *,
        char *, struct cred *, caller_context_t *, int);
static int32_t udf_link(struct vnode *,
        struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_rename(struct vnode *,
        char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_mkdir(struct vnode *,
        char *, struct vattr *, struct vnode **, struct cred *,
        caller_context_t *, int, vsecattr_t *);
static int32_t udf_rmdir(struct vnode *,
        char *, struct vnode *, struct cred *, caller_context_t *, int);
static int32_t udf_readdir(struct vnode *,
        struct uio *, struct cred *, int32_t *, caller_context_t *, int);
static int32_t udf_symlink(struct vnode *,
        char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_readlink(struct vnode *,
        struct uio *, struct cred *, caller_context_t *);
static int32_t udf_fsync(struct vnode *,
        int32_t, struct cred *, caller_context_t *);
static void udf_inactive(struct vnode *,
        struct cred *, caller_context_t *);
static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
        caller_context_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
        struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
        caller_context_t *);
static int32_t udf_space(struct vnode *, int32_t,
        struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
        size_t, uint32_t *, struct page **, size_t,
        struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
static int32_t udf_putpage(struct vnode *, offset_t,
        size_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
        caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
        caller_context_t *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
        caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
        caller_context_t *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
        caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
        caller_context_t *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
        ulong_t *, struct cred *, caller_context_t *);
static int32_t udf_pageio(struct vnode *, struct page *,
        u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);

int32_t ud_getpage_miss(struct vnode *, u_offset_t,
        size_t, struct seg *, caddr_t, page_t *pl[],
        size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
        u_offset_t, uint32_t, u_offset_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
int32_t ud_slave_done(struct buf *);

/*
 * Structures to control multiple IO operations to get or put pages
 * that are backed by discontiguous blocks. The master struct is
 * a dummy that holds the original bp from pageio_setup. The
 * slave structs hold the working bp's that do the actual IO. Once
 * all the slave IOs complete, the master is processed as if a single
 * IO op has completed.
 */
uint32_t master_index = 0;
typedef struct mio_master {
        kmutex_t        mm_mutex;       /* protect the fields below */
        int32_t         mm_size;
        buf_t           *mm_bp;         /* original bp */
        int32_t         mm_resid;       /* bytes remaining to transfer */
        int32_t         mm_error;       /* accumulated error from slaves */
        int32_t         mm_index;       /* XXX debugging */
} mio_master_t;

typedef struct mio_slave {
        buf_t           ms_buf;         /* working buffer for this IO chunk */
        mio_master_t    *ms_ptr;        /* pointer to master */
} mio_slave_t;

struct vnodeops *udf_vnodeops;

const fs_operation_def_t udf_vnodeops_template[] = {
        VOPNAME_OPEN,           { .vop_open = udf_open },
        VOPNAME_CLOSE,          { .vop_close = udf_close },
        VOPNAME_READ,           { .vop_read = udf_read },
        VOPNAME_WRITE,          { .vop_write = udf_write },
        VOPNAME_IOCTL,          { .vop_ioctl = udf_ioctl },
        VOPNAME_GETATTR,        { .vop_getattr = udf_getattr },
        VOPNAME_SETATTR,        { .vop_setattr = udf_setattr },
        VOPNAME_ACCESS,         { .vop_access = udf_access },
        VOPNAME_LOOKUP,         { .vop_lookup = udf_lookup },
        VOPNAME_CREATE,         { .vop_create = udf_create },
        VOPNAME_REMOVE,         { .vop_remove = udf_remove },
        VOPNAME_LINK,           { .vop_link = udf_link },
        VOPNAME_RENAME,         { .vop_rename = udf_rename },
        VOPNAME_MKDIR,          { .vop_mkdir = udf_mkdir },
        VOPNAME_RMDIR,          { .vop_rmdir = udf_rmdir },
        VOPNAME_READDIR,        { .vop_readdir = udf_readdir },
        VOPNAME_SYMLINK,        { .vop_symlink = udf_symlink },
        VOPNAME_READLINK,       { .vop_readlink = udf_readlink },
        VOPNAME_FSYNC,          { .vop_fsync = udf_fsync },
        VOPNAME_INACTIVE,       { .vop_inactive = udf_inactive },
        VOPNAME_FID,            { .vop_fid = udf_fid },
        VOPNAME_RWLOCK,         { .vop_rwlock = udf_rwlock },
        VOPNAME_RWUNLOCK,       { .vop_rwunlock = udf_rwunlock },
        VOPNAME_SEEK,           { .vop_seek = udf_seek },
        VOPNAME_FRLOCK,         { .vop_frlock = udf_frlock },
        VOPNAME_SPACE,          { .vop_space = udf_space },
        VOPNAME_GETPAGE,        { .vop_getpage = udf_getpage },
        VOPNAME_PUTPAGE,        { .vop_putpage = udf_putpage },
        VOPNAME_MAP,            { .vop_map = udf_map },
        VOPNAME_ADDMAP,         { .vop_addmap = udf_addmap },
        VOPNAME_DELMAP,         { .vop_delmap = udf_delmap },
        VOPNAME_PATHCONF,       { .vop_pathconf = udf_l_pathconf },
        VOPNAME_PAGEIO,         { .vop_pageio = udf_pageio },
        VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
        NULL,                   NULL
};

/* ARGSUSED */
static int32_t
udf_open(
        struct vnode **vpp,
        int32_t flag,
        struct cred *cr,
        caller_context_t *ct)
{
        ud_printf("udf_open\n");

        return (0);
}

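/*
 * Per-close cleanup: release any record locks and share reservations
 * held by the closing process; the partially filled delayed-write
 * cluster, if any, is pushed below on (approximately) the last close.
 */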
/* ARGSUSED */
static int32_t
udf_close(
        struct vnode *vp,
        int32_t flag,
        int32_t count,
        offset_t offset,
        struct cred *cr,
        caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_close\n");

        ITIMES(ip);

        cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
        cleanshares(vp, ttoproc(curthread)->p_pid);

        /*
         * Push partially filled cluster at last close.
         * ``last close'' is approximated because the dnlc
         * may have a hold on the vnode.
         */
        if (vp->v_count <= 2 && vp->v_type != VBAD) {
                if (ip->i_delaylen) {
                        (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
                            B_ASYNC | B_FREE, cr);
                        ip->i_delaylen = 0;
                }
        }

        return (0);
}

/* ARGSUSED */
static int32_t
udf_read(
        struct vnode *vp,
        struct uio *uiop,
        int32_t ioflag,
        struct cred *cr,
        caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);
        int32_t error;

        ud_printf("udf_read\n");

#ifdef  __lock_lint
        rw_enter(&ip->i_rwlock, RW_READER);
#endif

        ASSERT(RW_READ_HELD(&ip->i_rwlock));

        if (MANDLOCK(vp, ip->i_char)) {
                /*
                 * udf_getattr ends up being called by chklock
                 */
                error = chklock(vp, FREAD, uiop->uio_loffset,
                    uiop->uio_resid, uiop->uio_fmode, ct);
                if (error) {
                        goto end;
                }
        }

        rw_enter(&ip->i_contents, RW_READER);
        error = ud_rdip(ip, uiop, ioflag, cr);
        rw_exit(&ip->i_contents);

end:
#ifdef  __lock_lint
        rw_exit(&ip->i_rwlock);
#endif

        return (error);
}


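/*
 * Write-throttling tunables: when ud_WRITES is set, a writer blocks in
 * udf_write() once the inode's outstanding write bytes (i_writes)
 * exceed the high-water mark ud_HW, and is woken via i_wrcv as the
 * backlog drains (presumably once it falls below the low-water mark
 * ud_LW, which is handled where the writes complete).  ud_throttles
 * counts how many times the throttle kicked in.
 */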
int32_t ud_WRITES = 1;
int32_t ud_HW = 96 * 1024;
int32_t ud_LW = 64 * 1024;
int32_t ud_throttles = 0;

/* ARGSUSED */
static int32_t
udf_write(
        struct vnode *vp,
        struct uio *uiop,
        int32_t ioflag,
        struct cred *cr,
        caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);
        int32_t error = 0;

        ud_printf("udf_write\n");

#ifdef  __lock_lint
        rw_enter(&ip->i_rwlock, RW_WRITER);
#endif

        ASSERT(RW_WRITE_HELD(&ip->i_rwlock));

        if (MANDLOCK(vp, ip->i_char)) {
                /*
                 * udf_getattr ends up being called by chklock
                 */
                error = chklock(vp, FWRITE, uiop->uio_loffset,
                    uiop->uio_resid, uiop->uio_fmode, ct);
                if (error) {
                        goto end;
                }
        }
        /*
         * Throttle writes.
         */
        mutex_enter(&ip->i_tlock);
        if (ud_WRITES && (ip->i_writes > ud_HW)) {
                while (ip->i_writes > ud_HW) {
                        ud_throttles++;
                        cv_wait(&ip->i_wrcv, &ip->i_tlock);
                }
        }
        mutex_exit(&ip->i_tlock);

        /*
         * Write to the file
         */
        rw_enter(&ip->i_contents, RW_WRITER);
        if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
                /*
                 * In append mode start at end of file.
                 */
                uiop->uio_loffset = ip->i_size;
        }
        error = ud_wrip(ip, uiop, ioflag, cr);
        rw_exit(&ip->i_contents);

end:
#ifdef  __lock_lint
        rw_exit(&ip->i_rwlock);
#endif

        return (error);
}

/* ARGSUSED */
static int32_t
udf_ioctl(
        struct vnode *vp,
        int32_t cmd,
        intptr_t arg,
        int32_t flag,
        struct cred *cr,
        int32_t *rvalp,
        caller_context_t *ct)
{
        return (ENOTTY);
}

/* ARGSUSED */
static int32_t
udf_getattr(
        struct vnode *vp,
        struct vattr *vap,
        int32_t flags,
        struct cred *cr,
        caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_getattr\n");

        if (vap->va_mask == AT_SIZE) {
                /*
                 * For performance, if only the size is requested, don't bother
                 * with anything else.
                 */
                vap->va_size = ip->i_size;
                return (0);
        }

        rw_enter(&ip->i_contents, RW_READER);

        vap->va_type = vp->v_type;
        vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;

        vap->va_uid = ip->i_uid;
        vap->va_gid = ip->i_gid;
        vap->va_fsid = ip->i_dev;
        vap->va_nodeid = ip->i_icb_lbano;
        vap->va_nlink = ip->i_nlink;
        vap->va_size = ip->i_size;
        vap->va_seq = ip->i_seq;
        if (vp->v_type == VCHR || vp->v_type == VBLK) {
                vap->va_rdev = ip->i_rdev;
        } else {
                vap->va_rdev = 0;
        }

        mutex_enter(&ip->i_tlock);
        ITIMES_NOLOCK(ip);      /* mark correct time in inode */
        vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
        vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
        vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
        vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
        vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
        vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
        mutex_exit(&ip->i_tlock);

        switch (ip->i_type) {
                case VBLK:
                case VCHR:
                        vap->va_blksize = MAXBSIZE;
                        break;
                default:
                        vap->va_blksize = ip->i_udf->udf_lbsize;
                        break;
        }
        vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;

        rw_exit(&ip->i_contents);

        return (0);
}

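/*
 * Adapter with the generic (void *, mode, cred) signature that
 * secpolicy_vnode_setattr() expects for its access-check callback;
 * it simply forwards to ud_iaccess().
 */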
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
        return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0));
}

/*ARGSUSED4*/
static int32_t
udf_setattr(
        struct vnode *vp,
        struct vattr *vap,
        int32_t flags,
        struct cred *cr,
        caller_context_t *ct)
{
        int32_t error = 0;
        uint32_t mask = vap->va_mask;
        struct ud_inode *ip;
        timestruc_t now;
        struct vattr ovap;

        ud_printf("udf_setattr\n");

        ip = VTOI(vp);

        /*
         * No updates are allowed to files using ICB strategy type 4096.
         */
        if (ip->i_astrat == STRAT_TYPE4096) {
                return (EINVAL);
        }

        /*
         * Cannot set these attributes
         */
        if (mask & AT_NOSET) {
                return (EINVAL);
        }

        rw_enter(&ip->i_rwlock, RW_WRITER);
        rw_enter(&ip->i_contents, RW_WRITER);

        ovap.va_uid = ip->i_uid;
        ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
        error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
            ud_iaccess_vmode, ip);
        if (error)
                goto update_inode;

        mask = vap->va_mask;
        /*
         * Change file access modes.
         */
        if (mask & AT_MODE) {
                ip->i_perm = VA2UD_PERM(vap->va_mode);
                ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
                mutex_enter(&ip->i_tlock);
                ip->i_flag |= ICHG;
                mutex_exit(&ip->i_tlock);
        }
        if (mask & (AT_UID|AT_GID)) {
                if (mask & AT_UID) {
                        ip->i_uid = vap->va_uid;
                }
                if (mask & AT_GID) {
                        ip->i_gid = vap->va_gid;
                }
                mutex_enter(&ip->i_tlock);
                ip->i_flag |= ICHG;
                mutex_exit(&ip->i_tlock);
        }
        /*
         * Truncate file.  Must have write permission and not be a directory.
         */
        if (mask & AT_SIZE) {
                if (vp->v_type == VDIR) {
                        error = EISDIR;
                        goto update_inode;
                }
                if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
                        goto update_inode;
                }
                if (vap->va_size > MAXOFFSET_T) {
                        error = EFBIG;
                        goto update_inode;
                }
                if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
                        goto update_inode;
                }

                if (vap->va_size == 0)
                        vnevent_truncate(vp, ct);
        }
        /*
         * Change file access or modified times.
         */
        if (mask & (AT_ATIME|AT_MTIME)) {
                mutex_enter(&ip->i_tlock);
                if (mask & AT_ATIME) {
                        ip->i_atime.tv_sec = vap->va_atime.tv_sec;
                        ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
                        ip->i_flag &= ~IACC;
                }
                if (mask & AT_MTIME) {
                        ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
                        ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
                        gethrestime(&now);
                        ip->i_ctime.tv_sec = now.tv_sec;
                        ip->i_ctime.tv_nsec = now.tv_nsec;
                        ip->i_flag &= ~(IUPD|ICHG);
                        ip->i_flag |= IMODTIME;
                }
                ip->i_flag |= IMOD;
                mutex_exit(&ip->i_tlock);
        }

update_inode:
        if (curthread->t_flag & T_DONTPEND) {
                ud_iupdat(ip, 1);
        } else {
                ITIMES_NOLOCK(ip);
        }
        rw_exit(&ip->i_contents);
        rw_exit(&ip->i_rwlock);

        return (error);
}

/* ARGSUSED */
static int32_t
udf_access(
        struct vnode *vp,
        int32_t mode,
        int32_t flags,
        struct cred *cr,
        caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_access\n");

        if (ip->i_udf == NULL) {
                return (EIO);
        }

        return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
}

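/*
 * When set, udf_lookup() marks sticky, non-executable regular files
 * with VISSWAP, the traditional hack under which such files are
 * treated like swap files (see the S_ISVTX notes in chmod(2)).
 */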
int32_t udfs_stickyhack = 1;

/* ARGSUSED */
static int32_t
udf_lookup(
        struct vnode *dvp,
        char *nm,
        struct vnode **vpp,
        struct pathname *pnp,
        int32_t flags,
        struct vnode *rdir,
        struct cred *cr,
        caller_context_t *ct,
        int *direntflags,
        pathname_t *realpnp)
{
        int32_t error;
        struct vnode *vp;
        struct ud_inode *ip, *xip;

        ud_printf("udf_lookup\n");
        /*
         * A null component name is a synonym for the directory being
         * searched.
         */
        if (*nm == '\0') {
                VN_HOLD(dvp);
                *vpp = dvp;
                error = 0;
                goto out;
        }

        /*
         * Fast path: Check the directory name lookup cache.
         */
        ip = VTOI(dvp);
        if (vp = dnlc_lookup(dvp, nm)) {
                /*
                 * Check accessibility of directory.
                 */
                if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
                        VN_RELE(vp);
                }
                xip = VTOI(vp);
        } else {
                error = ud_dirlook(ip, nm, &xip, cr, 1);
                ITIMES(ip);
        }

        if (error == 0) {
                ip = xip;
                *vpp = ITOV(ip);
                if ((ip->i_type != VDIR) &&
                    (ip->i_char & ISVTX) &&
                    ((ip->i_perm & IEXEC) == 0) &&
                    udfs_stickyhack) {
                        mutex_enter(&(*vpp)->v_lock);
                        (*vpp)->v_flag |= VISSWAP;
                        mutex_exit(&(*vpp)->v_lock);
                }
                ITIMES(ip);
                /*
                 * If the vnode is a device, return the special vnode instead.
                 */
                if (IS_DEVVP(*vpp)) {
                        struct vnode *newvp;
                        newvp = specvp(*vpp, (*vpp)->v_rdev,
                            (*vpp)->v_type, cr);
                        VN_RELE(*vpp);
                        if (newvp == NULL) {
                                error = ENOSYS;
                        } else {
                                *vpp = newvp;
                        }
                }
        }
out:
        return (error);
}

/* ARGSUSED */
static int32_t
udf_create(
        struct vnode *dvp,
        char *name,
        struct vattr *vap,
        enum vcexcl excl,
        int32_t mode,
        struct vnode **vpp,
        struct cred *cr,
        int32_t flag,
        caller_context_t *ct,
        vsecattr_t *vsecp)
{
        int32_t error;
        struct ud_inode *ip = VTOI(dvp), *xip;

        ud_printf("udf_create\n");

        if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
                vap->va_mode &= ~VSVTX;

        if (*name == '\0') {
                /*
                 * Null component name refers to the directory itself.
                 */
                VN_HOLD(dvp);
                ITIMES(ip);
                error = EEXIST;
        } else {
                xip = NULL;
                rw_enter(&ip->i_rwlock, RW_WRITER);
                error = ud_direnter(ip, name, DE_CREATE,
                    (struct ud_inode *)0, (struct ud_inode *)0,
                    vap, &xip, cr, ct);
                rw_exit(&ip->i_rwlock);
                ITIMES(ip);
                ip = xip;
        }
#ifdef  __lock_lint
        rw_enter(&ip->i_contents, RW_WRITER);
#else
        if (ip != NULL) {
                rw_enter(&ip->i_contents, RW_WRITER);
        }
#endif

        /*
         * If the file already exists and this is a non-exclusive create,
         * check permissions and allow access for non-directories.
         * Read-only create of an existing directory is also allowed.
         * We fail an exclusive create of anything which already exists.
         */
        if (error == EEXIST) {
                if (excl == NONEXCL) {
                        if ((ip->i_type == VDIR) && (mode & VWRITE)) {
                                error = EISDIR;
                        } else if (mode) {
                                error = ud_iaccess(ip,
                                    UD_UPERM2DPERM(mode), cr, 0);
                        } else {
                                error = 0;
                        }
                }
                if (error) {
                        rw_exit(&ip->i_contents);
                        VN_RELE(ITOV(ip));
                        goto out;
                } else if ((ip->i_type == VREG) &&
                    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
                        /*
                         * Truncate regular files, if requested by caller.
                         * Grab i_rwlock to make sure no one else is
                         * currently writing to the file (we promised
                         * bmap we would do this).
                         * Must get the locks in the correct order.
                         */
                        if (ip->i_size == 0) {
                                ip->i_flag |= ICHG | IUPD;
                        } else {
                                rw_exit(&ip->i_contents);
                                rw_enter(&ip->i_rwlock, RW_WRITER);
                                rw_enter(&ip->i_contents, RW_WRITER);
                                (void) ud_itrunc(ip, 0, 0, cr);
                                rw_exit(&ip->i_rwlock);
                        }
                        vnevent_create(ITOV(ip), ct);
                }
        }

        if (error == 0) {
                *vpp = ITOV(ip);
                ITIMES(ip);
        }
#ifdef  __lock_lint
        rw_exit(&ip->i_contents);
#else
        if (ip != NULL) {
                rw_exit(&ip->i_contents);
        }
#endif
        if (error) {
                goto out;
        }

        /*
         * If the vnode is a device, return the special vnode instead.
         */
        if (!error && IS_DEVVP(*vpp)) {
                struct vnode *newvp;

                newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
                VN_RELE(*vpp);
                if (newvp == NULL) {
                        error = ENOSYS;
                        goto out;
                }
                *vpp = newvp;
        }
out:
        return (error);
}

/* ARGSUSED */
static int32_t
udf_remove(
        struct vnode *vp,
        char *nm,
        struct cred *cr,
        caller_context_t *ct,
        int flags)
{
        int32_t error;
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_remove\n");

        rw_enter(&ip->i_rwlock, RW_WRITER);
        error = ud_dirremove(ip, nm,
            (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
        rw_exit(&ip->i_rwlock);
        ITIMES(ip);

        return (error);
}

/* ARGSUSED */
static int32_t
udf_link(
        struct vnode *tdvp,
        struct vnode *svp,
        char *tnm,
        struct cred *cr,
        caller_context_t *ct,
        int flags)
{
        int32_t error;
        struct vnode *realvp;
        struct ud_inode *sip;
        struct ud_inode *tdp;

        ud_printf("udf_link\n");
        if (VOP_REALVP(svp, &realvp, ct) == 0) {
                svp = realvp;
        }

        /*
         * Do not allow links to directories
         */
        if (svp->v_type == VDIR) {
                return (EPERM);
        }

        sip = VTOI(svp);

        if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
                return (EPERM);

        tdp = VTOI(tdvp);

        rw_enter(&tdp->i_rwlock, RW_WRITER);
        error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
            sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
        rw_exit(&tdp->i_rwlock);
        ITIMES(sip);
        ITIMES(tdp);

        if (error == 0) {
                vnevent_link(svp, ct);
        }

        return (error);
}

/* ARGSUSED */
static int32_t
udf_rename(
        struct vnode *sdvp,
        char *snm,
        struct vnode *tdvp,
        char *tnm,
        struct cred *cr,
        caller_context_t *ct,
        int flags)
{
        int32_t error = 0;
        struct udf_vfs *udf_vfsp;
        struct ud_inode *sip;           /* source inode */
        struct ud_inode *tip;           /* target inode */
        struct ud_inode *sdp, *tdp;     /* source and target parent inode */
        struct vnode *realvp;

        ud_printf("udf_rename\n");

        if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
                tdvp = realvp;
        }

        sdp = VTOI(sdvp);
        tdp = VTOI(tdvp);

        udf_vfsp = sdp->i_udf;

        mutex_enter(&udf_vfsp->udf_rename_lck);
        /*
         * Look up inode of file we're supposed to rename.
         */
        if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
                mutex_exit(&udf_vfsp->udf_rename_lck);
                return (error);
        }
        /*
         * Be sure this is not a directory with another file system mounted
         * over it.  If it is, just give up the lock and return with
         * EBUSY.
         */
        if (vn_mountedvfs(ITOV(sip)) != NULL) {
                error = EBUSY;
                goto errout;
        }
        /*
         * Make sure we can delete the source entry.  This requires
         * write permission on the containing directory.  If that
         * directory is "sticky" it further requires (except for
         * privileged users) that the user own the directory or the
         * source entry, or else have permission to write the source
         * entry.
         */
        rw_enter(&sdp->i_contents, RW_READER);
        rw_enter(&sip->i_contents, RW_READER);
        if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
            (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
                rw_exit(&sip->i_contents);
                rw_exit(&sdp->i_contents);
                ITIMES(sip);
                goto errout;
        }

        /*
         * Check for renaming '.' or '..' or alias of '.'
         */
        if ((strcmp(snm, ".") == 0) ||
            (strcmp(snm, "..") == 0) ||
            (sdp == sip)) {
                error = EINVAL;
                rw_exit(&sip->i_contents);
                rw_exit(&sdp->i_contents);
                goto errout;
        }

        rw_exit(&sip->i_contents);
        rw_exit(&sdp->i_contents);

        if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
                vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
                VN_RELE(ITOV(tip));
        }

        /* Notify the target dir. if not the same as the source dir. */
        if (sdvp != tdvp)
                vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);

        vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);

        /*
         * Link source to the target.
         */
        rw_enter(&tdp->i_rwlock, RW_WRITER);
        if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
            (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
                /*
                 * ESAME isn't really an error; it indicates that the
                 * operation should not be done because the source and target
                 * are the same file, but that no error should be reported.
                 */
                if (error == ESAME) {
                        error = 0;
                }
                rw_exit(&tdp->i_rwlock);
                goto errout;
        }
        rw_exit(&tdp->i_rwlock);

        rw_enter(&sdp->i_rwlock, RW_WRITER);
        /*
         * Unlink the source.
         * Remove the source entry.  ud_dirremove() checks that the entry
         * still reflects sip, and returns an error if it doesn't.
         * If the entry has changed, just forget about it.  Release
         * the source inode.
         */
        if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
            DR_RENAME, cr, ct)) == ENOENT) {
                error = 0;
        }
        rw_exit(&sdp->i_rwlock);

        if (error == 0) {
                vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
                /*
                 * vnevent_rename_dest and vnevent_rename_dest_dir are called
                 * in ud_direnter().
                 */
        }

errout:
        ITIMES(sdp);
        ITIMES(tdp);
        VN_RELE(ITOV(sip));
        mutex_exit(&udf_vfsp->udf_rename_lck);

        return (error);
}

/* ARGSUSED */
static int32_t
udf_mkdir(
        struct vnode *dvp,
        char *dirname,
        struct vattr *vap,
        struct vnode **vpp,
        struct cred *cr,
        caller_context_t *ct,
        int flags,
        vsecattr_t *vsecp)
{
        int32_t error;
        struct ud_inode *ip;
        struct ud_inode *xip;

        ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

        ud_printf("udf_mkdir\n");

        ip = VTOI(dvp);
        rw_enter(&ip->i_rwlock, RW_WRITER);
        error = ud_direnter(ip, dirname, DE_MKDIR,
            (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
        rw_exit(&ip->i_rwlock);
        ITIMES(ip);
        if (error == 0) {
                ip = xip;
                *vpp = ITOV(ip);
                ITIMES(ip);
        } else if (error == EEXIST) {
                ITIMES(xip);
                VN_RELE(ITOV(xip));
        }

        return (error);
}

/* ARGSUSED */
static int32_t
udf_rmdir(
        struct vnode *vp,
        char *nm,
        struct vnode *cdir,
        struct cred *cr,
        caller_context_t *ct,
        int flags)
{
        int32_t error;
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_rmdir\n");

        rw_enter(&ip->i_rwlock, RW_WRITER);
        error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
            cr, ct);
        rw_exit(&ip->i_rwlock);
        ITIMES(ip);

        return (error);
}

/* ARGSUSED */
static int32_t
udf_readdir(
        struct vnode *vp,
        struct uio *uiop,
        struct cred *cr,
        int32_t *eofp,
        caller_context_t *ct,
        int flags)
{
        struct ud_inode *ip;
        struct dirent64 *nd;
        struct udf_vfs *udf_vfsp;
        int32_t error = 0, len, outcount = 0;
        uint32_t dirsiz, offset;
        uint32_t bufsize, ndlen, dummy;
        caddr_t outbuf;
        caddr_t outb, end_outb;
        struct iovec *iovp;

        uint8_t *dname;
        int32_t length;

        uint8_t *buf = NULL;

        struct fbuf *fbp = NULL;
        struct file_id *fid;
        uint8_t *name;


        ud_printf("udf_readdir\n");

        ip = VTOI(vp);
        udf_vfsp = ip->i_udf;

        dirsiz = ip->i_size;
        if ((uiop->uio_offset >= dirsiz) ||
            (ip->i_nlink <= 0)) {
                if (eofp) {
                        *eofp = 1;
                }
                return (0);
        }

        offset = uiop->uio_offset;
        iovp = uiop->uio_iov;
        bufsize = iovp->iov_len;

        outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
        end_outb = outb + bufsize;
        nd = (struct dirent64 *)outbuf;

        dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
        buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);

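        /*
         * Synthesize the "." entry at offset 0 and give it the
         * pseudo-offset 0x10 as d_off; a caller resuming from that
         * pseudo-offset is reset to offset 0 to parse the on-disk
         * FIDs.  ".." comes from the FID_PARENT entry in the loop
         * below.
         */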
        if (offset == 0) {
                len = DIRENT64_RECLEN(1);
                if (((caddr_t)nd + len) >= end_outb) {
                        error = EINVAL;
                        goto end;
                }
                nd->d_ino = ip->i_icb_lbano;
                nd->d_reclen = (uint16_t)len;
                nd->d_off = 0x10;
                nd->d_name[0] = '.';
                bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
                nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
                outcount++;
        } else if (offset == 0x10) {
                offset = 0;
        }

        while (offset < dirsiz) {
                error = ud_get_next_fid(ip, &fbp,
                    offset, &fid, &name, buf);
                if (error != 0) {
                        break;
                }

                if ((fid->fid_flags & FID_DELETED) == 0) {
                        if (fid->fid_flags & FID_PARENT) {

                                len = DIRENT64_RECLEN(2);
                                if (((caddr_t)nd + len) >= end_outb) {
                                        error = EINVAL;
                                        break;
                                }

                                nd->d_ino = ip->i_icb_lbano;
                                nd->d_reclen = (uint16_t)len;
                                nd->d_off = offset + FID_LEN(fid);
                                nd->d_name[0] = '.';
                                nd->d_name[1] = '.';
                                bzero(&nd->d_name[2],
                                    DIRENT64_NAMELEN(len) - 2);
                                nd = (struct dirent64 *)
                                    ((char *)nd + nd->d_reclen);
                        } else {
                                if ((error = ud_uncompress(fid->fid_idlen,
                                    &length, name, dname)) != 0) {
                                        break;
                                }
                                if (length == 0) {
                                        offset += FID_LEN(fid);
                                        continue;
                                }
                                len = DIRENT64_RECLEN(length);
                                if (((caddr_t)nd + len) >= end_outb) {
                                        if (!outcount) {
                                                error = EINVAL;
                                        }
                                        break;
                                }
                                (void) strncpy(nd->d_name,
                                    (caddr_t)dname, length);
                                bzero(&nd->d_name[length],
                                    DIRENT64_NAMELEN(len) - length);
                                nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
                                    SWAP_16(fid->fid_icb.lad_ext_prn),
                                    SWAP_32(fid->fid_icb.lad_ext_loc), 1,
                                    &dummy);
                                nd->d_reclen = (uint16_t)len;
                                nd->d_off = offset + FID_LEN(fid);
                                nd = (struct dirent64 *)
                                    ((char *)nd + nd->d_reclen);
                        }
                        outcount++;
                }

                offset += FID_LEN(fid);
        }

end:
        if (fbp != NULL) {
                fbrelse(fbp, S_OTHER);
        }
        ndlen = ((char *)nd - outbuf);
        /*
         * In case of error do not call uiomove.
         * Return the error to the caller.
         */
        if ((error == 0) && (ndlen != 0)) {
                error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
                uiop->uio_offset = offset;
        }
        kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
        kmem_free((caddr_t)dname, 1024);
        kmem_free(outbuf, (uint32_t)bufsize);
        if (eofp && error == 0) {
                *eofp = (uiop->uio_offset >= dirsiz);
        }
        return (error);
}

/* ARGSUSED */
static int32_t
udf_symlink(
        struct vnode *dvp,
        char *linkname,
        struct vattr *vap,
        char *target,
        struct cred *cr,
        caller_context_t *ct,
        int flags)
{
        int32_t error = 0, outlen;
        uint32_t ioflag = 0;
        struct ud_inode *ip, *dip = VTOI(dvp);

        struct path_comp *pc;
        int8_t *dname = NULL, *uname = NULL, *sp;

        ud_printf("udf_symlink\n");

        ip = (struct ud_inode *)0;
        vap->va_type = VLNK;
        vap->va_rdev = 0;

        rw_enter(&dip->i_rwlock, RW_WRITER);
        error = ud_direnter(dip, linkname, DE_CREATE,
            (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
        rw_exit(&dip->i_rwlock);
        if (error == 0) {
                dname = kmem_zalloc(1024, KM_SLEEP);
                uname = kmem_zalloc(PAGESIZE, KM_SLEEP);

                pc = (struct path_comp *)uname;
                /*
                 * If the first character in target is "/", create a
                 * root entry for it and then skip any leading slashes.
                 */
                if (*target == '/') {
                        pc->pc_type = 2;
                        pc->pc_len = 0;
                        pc = (struct path_comp *)(((char *)pc) + 4);
                        while (*target == '/') {
                                target++;
                        }
                }

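                /*
                 * Each path_comp written below occupies a 4-byte
                 * header plus pc_len identifier bytes, which is why
                 * the cursor advances by 4 (types 2, 3 and 4) or by
                 * 4 + outlen (type 5 named components).
                 */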
                while (*target != '\0') {
                        sp = target;
                        while ((*target != '/') && (*target != '\0')) {
                                target ++;
                        }
                        /*
                         * We got the next component of the
                         * path name.  Create a path_comp of
                         * the appropriate type.
                         */
                        if (((target - sp) == 1) && (*sp == '.')) {
                                /*
                                 * Dot entry.
                                 */
                                pc->pc_type = 4;
                                pc = (struct path_comp *)(((char *)pc) + 4);
                        } else if (((target - sp) == 2) &&
                            (*sp == '.') && ((*(sp + 1)) == '.')) {
                                /*
                                 * DotDot entry.
                                 */
                                pc->pc_type = 3;
                                pc = (struct path_comp *)(((char *)pc) + 4);
                        } else {
                                /*
                                 * Convert the user-given name
                                 * into the appropriate form to be
                                 * put on the media.
                                 */
                                outlen = 1024;  /* set to size of dname */
                                if (error = ud_compress(target - sp, &outlen,
                                    (uint8_t *)sp, (uint8_t *)dname)) {
                                        break;
                                }
                                pc->pc_type = 5;
                                /* LINTED */
                                pc->pc_len = outlen;
                                dname[outlen] = '\0';
                                (void) strcpy((char *)pc->pc_id, dname);
                                pc = (struct path_comp *)
                                    (((char *)pc) + 4 + outlen);
                        }
                        while (*target == '/') {
                                target++;
                        }
                        if (*target == '\0') {
                                break;
                        }
                }

                rw_enter(&ip->i_contents, RW_WRITER);
                if (error == 0) {
                        ioflag = FWRITE;
                        if (curthread->t_flag & T_DONTPEND) {
                                ioflag |= FDSYNC;
                        }
                        error = ud_rdwri(UIO_WRITE, ioflag, ip,
                            uname, ((int8_t *)pc) - uname,
                            (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
                }
                if (error) {
                        ud_idrop(ip);
                        rw_exit(&ip->i_contents);
                        rw_enter(&dip->i_rwlock, RW_WRITER);
                        (void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
                            (struct vnode *)0, DR_REMOVE, cr, ct);
                        rw_exit(&dip->i_rwlock);
                        goto update_inode;
                }
                rw_exit(&ip->i_contents);
        }

        if ((error == 0) || (error == EEXIST)) {
                VN_RELE(ITOV(ip));
        }

update_inode:
        ITIMES(VTOI(dvp));
        if (uname != NULL) {
                kmem_free(uname, PAGESIZE);
        }
        if (dname != NULL) {
                kmem_free(dname, 1024);
        }

        return (error);
}

/* ARGSUSED */
static int32_t
udf_readlink(
        struct vnode *vp,
        struct uio *uiop,
        struct cred *cr,
        caller_context_t *ct)
{
        int32_t error = 0, off, id_len, size, len;
        int8_t *dname = NULL, *uname = NULL;
        struct ud_inode *ip;
        struct fbuf *fbp = NULL;
        struct path_comp *pc;

        ud_printf("udf_readlink\n");

        if (vp->v_type != VLNK) {
                return (EINVAL);
        }

        ip = VTOI(vp);
        size = ip->i_size;
        if (size > PAGESIZE) {
                return (EIO);
        }

        if (size == 0) {
                return (0);
        }

        dname = kmem_zalloc(1024, KM_SLEEP);
        uname = kmem_zalloc(PAGESIZE, KM_SLEEP);

        rw_enter(&ip->i_contents, RW_READER);

        if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
                goto end;
        }

        off = 0;

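        /*
         * Walk the on-disk path components and rebuild a pathname.
         * As decoded below, type 1 restarts at the mount point, type 2
         * at the root ("/"), type 3 is "..", type 4 is ".", and type 5
         * is a named component stored in compressed form.
         */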
        while (off < size) {
                pc = (struct path_comp *)(fbp->fb_addr + off);
                switch (pc->pc_type) {
                        case 1 :
                                (void) strcpy(uname, ip->i_udf->udf_fsmnt);
                                (void) strcat(uname, "/");
                                break;
                        case 2 :
                                if (pc->pc_len != 0) {
                                        goto end;
                                }
                                uname[0] = '/';
                                uname[1] = '\0';
                                break;
                        case 3 :
                                (void) strcat(uname, "../");
                                break;
                        case 4 :
                                (void) strcat(uname, "./");
                                break;
                        case 5 :
                                if ((error = ud_uncompress(pc->pc_len, &id_len,
                                    pc->pc_id, (uint8_t *)dname)) != 0) {
                                        break;
                                }
                                dname[id_len] = '\0';
                                (void) strcat(uname, dname);
                                (void) strcat(uname, "/");
                                break;
                        default :
                                error = EINVAL;
                                goto end;
                }
                off += 4 + pc->pc_len;
        }
        len = strlen(uname) - 1;
        if (uname[len] == '/') {
                if (len == 0) {
                        /*
                         * Special case: a link to "/".
                         */
                        len = 1;
                } else {
                        uname[len] = '\0';
                }
        }

        error = uiomove(uname, len, UIO_READ, uiop);

        ITIMES(ip);

end:
        if (fbp != NULL) {
                fbrelse(fbp, S_OTHER);
        }
        rw_exit(&ip->i_contents);
        if (uname != NULL) {
                kmem_free(uname, PAGESIZE);
        }
        if (dname != NULL) {
                kmem_free(dname, 1024);
        }
        return (error);
}

/* ARGSUSED */
static int32_t
udf_fsync(
        struct vnode *vp,
        int32_t syncflag,
        struct cred *cr,
        caller_context_t *ct)
{
        int32_t error = 0;
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_fsync\n");

        rw_enter(&ip->i_contents, RW_WRITER);
        if (!(IS_SWAPVP(vp))) {
                error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
        }
        if (error == 0) {
                error = ud_sync_indir(ip);
        }
        ITIMES(ip);             /* XXX: is this necessary ??? */
        rw_exit(&ip->i_contents);

        return (error);
}

/* ARGSUSED */
static void
udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
{
        ud_printf("udf_iinactive\n");

        ud_iinactive(VTOI(vp), cr);
}

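/*
 * Build a file identifier from the inode's unique id and the partition
 * and block of its ICB.  If the caller's buffer is too small, return
 * ENOSPC with fid_len set to the required size, the usual VOP_FID
 * convention for letting the caller retry with a larger buffer.
 */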
/* ARGSUSED */
static int32_t
udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
        struct udf_fid *udfidp;
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_fid\n");

        if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
                fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
                return (ENOSPC);
        }

        udfidp = (struct udf_fid *)fidp;
        bzero((char *)udfidp, sizeof (struct udf_fid));
        rw_enter(&ip->i_contents, RW_READER);
        udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
        udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
        udfidp->udfid_prn = ip->i_icb_prn;
        udfidp->udfid_icb_lbn = ip->i_icb_block;
        rw_exit(&ip->i_contents);

        return (0);
}

/* ARGSUSED2 */
static int
udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_rwlock\n");

        if (write_lock) {
                rw_enter(&ip->i_rwlock, RW_WRITER);
        } else {
                rw_enter(&ip->i_rwlock, RW_READER);
        }
#ifdef  __lock_lint
        rw_exit(&ip->i_rwlock);
#endif
        return (write_lock);
}

/* ARGSUSED */
static void
udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_rwunlock\n");

#ifdef  __lock_lint
        rw_enter(&ip->i_rwlock, RW_WRITER);
#endif

        rw_exit(&ip->i_rwlock);

}

/* ARGSUSED */
static int32_t
udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
{
        return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
}

static int32_t
udf_frlock(
        struct vnode *vp,
        int32_t cmd,
        struct flock64 *bfp,
        int32_t flag,
        offset_t offset,
        struct flk_callback *flk_cbp,
        cred_t *cr,
        caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_frlock\n");

        /*
         * If file is being mapped, disallow frlock.
         * XXX I am not holding tlock while checking i_mapcnt because the
         * current locking strategy drops all locks before calling fs_frlock.
         * So, mapcnt could change before we enter fs_frlock, making it
         * meaningless to have held tlock in the first place.
         */
        if ((ip->i_mapcnt > 0) &&
            (MANDLOCK(vp, ip->i_char))) {
                return (EAGAIN);
        }

        return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
}

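/*
 * Handle fcntl(F_FREESP): normalize the byte range described by bfp
 * with convoff() and free it via ud_freesp().
 */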
/*ARGSUSED6*/
static int32_t
udf_space(
        struct vnode *vp,
        int32_t cmd,
        struct flock64 *bfp,
        int32_t flag,
        offset_t offset,
        cred_t *cr,
        caller_context_t *ct)
{
        int32_t error = 0;

        ud_printf("udf_space\n");

        if (cmd != F_FREESP) {
                error =  EINVAL;
        } else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
                error = ud_freesp(vp, bfp, flag, cr);

                if (error == 0 && bfp->l_start == 0)
                        vnevent_truncate(vp, ct);
        }

        return (error);
}

/* ARGSUSED */
static int32_t
udf_getpage(
        struct vnode *vp,
        offset_t off,
        size_t len,
        uint32_t *protp,
        struct page **plarr,
        size_t plsz,
        struct seg *seg,
        caddr_t addr,
        enum seg_rw rw,
        struct cred *cr,
        caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);
        int32_t error, has_holes, beyond_eof, seqmode, dolock;
        int32_t pgsize = PAGESIZE;
        struct udf_vfs *udf_vfsp = ip->i_udf;
        page_t **pl;
        u_offset_t pgoff, eoff, uoff;
        krw_t rwtype;
        caddr_t pgaddr;

        ud_printf("udf_getpage\n");

        uoff = (u_offset_t)off; /* type conversion */
        if (protp) {
                *protp = PROT_ALL;
        }
        if (vp->v_flag & VNOMAP) {
                return (ENOSYS);
        }
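        /*
         * Detect strictly sequential access: the fault picks up exactly
         * where the previous read left off (i_nextr), which enables the
         * read-ahead logic further down.
         */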
        seqmode = ip->i_nextr == uoff && rw != S_CREATE;

        rwtype = RW_READER;
        dolock = (rw_owner(&ip->i_contents) != curthread);
retrylock:
#ifdef  __lock_lint
        rw_enter(&ip->i_contents, rwtype);
#else
        if (dolock) {
                rw_enter(&ip->i_contents, rwtype);
        }
#endif

        /*
         * We may be getting called as a side effect of a bmap using
         * fbread() when the blocks might be being allocated and the
         * size has not yet been updated.  In this case we want to be
         * able to return zero pages if we get back UDF_HOLE from
         * calling bmap for a non write case here.  We also might have
         * to read some frags from the disk into a page if we are
         * extending the number of frags for a given lbn in bmap().
         */
        beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
        if (beyond_eof && seg != segkmap) {
#ifdef  __lock_lint
                rw_exit(&ip->i_contents);
#else
                if (dolock) {
                        rw_exit(&ip->i_contents);
                }
#endif
                return (EFAULT);
        }

        /*
         * Must hold i_contents lock throughout the call to pvn_getpages
         * since locked pages are returned from each call to ud_getapage.
         * Must *not* return locked pages and then try for contents lock
         * due to lock ordering requirements (inode > page)
         */

        has_holes = ud_bmap_has_holes(ip);

        if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
                int32_t blk_size, count;
                u_offset_t offset;

                /*
                 * We must acquire the RW_WRITER lock in order to
                 * call ud_bmap_write().
                 */
                if (dolock && rwtype == RW_READER) {
                        rwtype = RW_WRITER;

                        if (!rw_tryupgrade(&ip->i_contents)) {

                                rw_exit(&ip->i_contents);

                                goto retrylock;
                        }
                }

                /*
                 * May be allocating disk blocks for holes here as
                 * a result of mmap faults.  write(2) does the
                 * ud_bmap_write() in ud_rdip()/ud_wrip(), not here.
                 * We are not dealing with frags in this case.
                 */
                offset = uoff;
                while ((offset < uoff + len) &&
                    (offset < ip->i_size)) {
                        /*
                         * The variable "bnp" is to simplify the expression
                         * for the compiler; just passing in &bn to bmap_write
                         * causes a compiler "loop".
                         */

                        blk_size = udf_vfsp->udf_lbsize;
                        if ((offset + blk_size) > ip->i_size) {
                                count = ip->i_size - offset;
                        } else {
                                count = blk_size;
                        }
                        error = ud_bmap_write(ip, offset, count, 0, cr);
                        if (error) {
                                goto update_inode;
                        }
                        offset += count; /* XXX - make this contig */
                }
        }

        /*
         * Can be a reader from now on.
         */
#ifdef  __lock_lint
        if (rwtype == RW_WRITER) {
                rw_downgrade(&ip->i_contents);
        }
#else
        if (dolock && rwtype == RW_WRITER) {
                rw_downgrade(&ip->i_contents);
        }
#endif

        /*
         * We remove PROT_WRITE in cases when the file has UDF holes
         * because we don't want to call bmap_read() to check each
         * page if it is backed with a disk block.
         */
        if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
                *protp &= ~PROT_WRITE;
        }

        error = 0;

        /*
         * The loop looks up pages in the range <off, off + len).
         * For each page, we first check if we should initiate an asynchronous
         * read ahead before we call page_lookup (we may sleep in page_lookup
         * for a previously initiated disk read).
         */
        eoff = (uoff + len);
        for (pgoff = uoff, pgaddr = addr, pl = plarr;
            pgoff < eoff; /* empty */) {
                page_t  *pp;
                u_offset_t      nextrio;
                se_t    se;

                se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);

                /*
                 * Handle async getpage (faultahead)
                 */
                if (plarr == NULL) {
                        ip->i_nextrio = pgoff;
                        ud_getpage_ra(vp, pgoff, seg, pgaddr);
                        pgoff += pgsize;
                        pgaddr += pgsize;
                        continue;
                }

                /*
                 * Check if we should initiate read ahead of next cluster.
                 * We call page_exists only when we need to confirm that
                 * we have the current page before we initiate the read ahead.
                 */
                nextrio = ip->i_nextrio;
                if (seqmode &&
                    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
                    nextrio < ip->i_size && page_exists(vp, pgoff))
                        ud_getpage_ra(vp, pgoff, seg, pgaddr);

                if ((pp = page_lookup(vp, pgoff, se)) != NULL) {

                        /*
                         * We found the page in the page cache.
                         */
                        *pl++ = pp;
                        pgoff += pgsize;
                        pgaddr += pgsize;
                        len -= pgsize;
                        plsz -= pgsize;
                } else  {

                        /*
                         * We have to create the page, or read it from disk.
                         */
                        if (error = ud_getpage_miss(vp, pgoff, len,
                            seg, pgaddr, pl, plsz, rw, seqmode)) {
                                goto error_out;
                        }

                        while (*pl != NULL) {
                                pl++;
                                pgoff += pgsize;
                                pgaddr += pgsize;
                                len -= pgsize;
                                plsz -= pgsize;
                        }
                }
        }

        /*
         * Return pages up to plsz if they are in the page cache.
         * We cannot return pages if there is a chance that they are
         * backed with a UDF hole and rw is S_WRITE or S_CREATE.
         */
        if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {

                ASSERT((protp == NULL) ||
                    !(has_holes && (*protp & PROT_WRITE)));

                eoff = pgoff + plsz;
                while (pgoff < eoff) {
                        page_t          *pp;

                        if ((pp = page_lookup_nowait(vp, pgoff,
                            SE_SHARED)) == NULL)
                                break;

                        *pl++ = pp;
                        pgoff += pgsize;
                        plsz -= pgsize;
                }
        }

        if (plarr)
                *pl = NULL;                     /* Terminate page list */
        ip->i_nextr = pgoff;

error_out:
        if (error && plarr) {
                /*
                 * Release any pages we have locked.
                 */
                while (pl > &plarr[0])
                        page_unlock(*--pl);

                plarr[0] = NULL;
        }

update_inode:
#ifdef  __lock_lint
        rw_exit(&ip->i_contents);
#else
        if (dolock) {
                rw_exit(&ip->i_contents);
        }
#endif

        /*
         * If the inode is not already marked for IACC (in rwip() for read)
         * and the inode is not marked for no access time update (in rwip()
         * for write) then update the inode access time and mod time now.
         */
        mutex_enter(&ip->i_tlock);
        if ((ip->i_flag & (IACC | INOACC)) == 0) {
                if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
                        ip->i_flag |= IACC;
                }
                if (rw == S_WRITE) {
                        ip->i_flag |= IUPD;
                }
                ITIMES_NOLOCK(ip);
        }
        mutex_exit(&ip->i_tlock);

        return (error);
}

int32_t ud_delay = 1;

/* ARGSUSED */
static int32_t
udf_putpage(
        struct vnode *vp,
        offset_t off,
        size_t len,
        int32_t flags,
        struct cred *cr,
        caller_context_t *ct)
{
        struct ud_inode *ip;
        int32_t error = 0;

        ud_printf("udf_putpage\n");

        ip = VTOI(vp);
#ifdef  __lock_lint
        rw_enter(&ip->i_contents, RW_WRITER);
#endif

        if (vp->v_count == 0) {
                cmn_err(CE_WARN, "ud_putpage : bad v_count");
                error = EINVAL;
                goto out;
        }

        if (vp->v_flag & VNOMAP) {
                error = ENOSYS;
                goto out;
        }

        if (flags & B_ASYNC) {
                if (ud_delay && len &&
                    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
                        mutex_enter(&ip->i_tlock);

                        /*
                         * If nobody stalled, start a new cluster.
                         */
                        if (ip->i_delaylen == 0) {
                                ip->i_delayoff = off;
                                ip->i_delaylen = len;
                                mutex_exit(&ip->i_tlock);
                                goto out;
                        }

                        /*
                         * If we have a full cluster or they are not contig,
                         * then push last cluster and start over.
                         */
                        if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
                            ip->i_delayoff + ip->i_delaylen != off) {
                                u_offset_t doff;
                                size_t dlen;

                                doff = ip->i_delayoff;
                                dlen = ip->i_delaylen;
                                ip->i_delayoff = off;
                                ip->i_delaylen = len;
                                mutex_exit(&ip->i_tlock);
                                error = ud_putpages(vp, doff, dlen, flags, cr);
                                /* LMXXX - flags are new val, not old */
                                goto out;
                        }

                        /*
                         * There is something there, it's not full, and
                         * it is contig.
                         */
                        ip->i_delaylen += len;
                        mutex_exit(&ip->i_tlock);
                        goto out;
                }

                /*
                 * Must have weird flags or we are not clustering.
                 */
        }
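
	/*
	 * Worked example of the clustering above (illustrative
	 * offsets, assuming WR_CLUSTSZ(ip) of 64K and 8K pages):
	 * an async putpage at off=0, len=8K just records the range
	 * (i_delayoff=0, i_delaylen=8K); contiguous calls at 8K,
	 * 16K, ... extend i_delaylen; the first non-contiguous call
	 * (say off=128K), or one that would grow the cluster past
	 * 64K, pushes the accumulated [i_delayoff, i_delayoff +
	 * i_delaylen) range via ud_putpages() and starts a new
	 * cluster at the new offset.
	 */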

        error = ud_putpages(vp, off, len, flags, cr);

out:
#ifdef  __lock_lint
        rw_exit(&ip->i_contents);
#endif
        return (error);
}

/* ARGSUSED */
static int32_t
udf_map(
        struct vnode *vp,
        offset_t off,
        struct as *as,
        caddr_t *addrp,
        size_t len,
        uint8_t prot,
        uint8_t maxprot,
        uint32_t flags,
        struct cred *cr,
        caller_context_t *ct)
{
        struct segvn_crargs vn_a;
        int32_t error = 0;

        ud_printf("udf_map\n");

        if (vp->v_flag & VNOMAP) {
                error = ENOSYS;
                goto end;
        }

        if ((off < (offset_t)0) ||
            ((off + len) < (offset_t)0)) {
                error = EINVAL;
                goto end;
        }

        if (vp->v_type != VREG) {
                error = ENODEV;
                goto end;
        }

        /*
         * If file is being locked, disallow mapping.
         */
        if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
                error = EAGAIN;
                goto end;
        }

        as_rangelock(as);
        error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
        if (error != 0) {
                as_rangeunlock(as);
                goto end;
        }

        vn_a.vp = vp;
        vn_a.offset = off;
        vn_a.type = flags & MAP_TYPE;
        vn_a.prot = prot;
        vn_a.maxprot = maxprot;
        vn_a.cred = cr;
        vn_a.amp = NULL;
        vn_a.flags = flags & ~MAP_TYPE;
        vn_a.szc = 0;
        vn_a.lgrp_mem_policy_flags = 0;

        error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
        as_rangeunlock(as);

end:
        return (error);
}

/* ARGSUSED */
static int32_t
udf_addmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint8_t prot, uint8_t maxprot, uint32_t flags,
    struct cred *cr, caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_addmap\n");

        if (vp->v_flag & VNOMAP) {
                return (ENOSYS);
        }

        mutex_enter(&ip->i_tlock);
        ip->i_mapcnt += btopr(len);
        mutex_exit(&ip->i_tlock);

        return (0);
}

/* ARGSUSED */
static int32_t
udf_delmap(struct vnode *vp, offset_t off, struct as *as, caddr_t addr,
    size_t len, uint32_t prot, uint32_t maxprot, uint32_t flags,
    struct cred *cr, caller_context_t *ct)
{
        struct ud_inode *ip = VTOI(vp);

        ud_printf("udf_delmap\n");

        if (vp->v_flag & VNOMAP) {
                return (ENOSYS);
        }

        mutex_enter(&ip->i_tlock);
        ip->i_mapcnt -= btopr(len);     /* Count released mappings */
        ASSERT(ip->i_mapcnt >= 0);
        mutex_exit(&ip->i_tlock);

        return (0);
}

/* ARGSUSED */
static int32_t
udf_l_pathconf(struct vnode *vp, int32_t cmd, ulong_t *valp, struct cred *cr,
    caller_context_t *ct)
{
        int32_t error = 0;

        ud_printf("udf_l_pathconf\n");

        if (cmd == _PC_FILESIZEBITS) {
		/*
		 * udf supports a 64-bit file size, but there are
		 * several other restrictions: it only supports
		 * 32-bit block numbers, and daddr32_t is only an
		 * int32_t.  Taking these into account, we stay
		 * right where ufs is.
		 */
                *valp = 41;
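		/*
		 * The arithmetic behind 41 (a sketch, assuming the
		 * smallest supported logical block size of 512 bytes):
		 *
		 *	2^31 addressable blocks (daddr32_t is signed)
		 *	  * 2^9 bytes per block
		 *	  = 2^40 bytes of maximum file size,
		 *
		 * and representing a size of 2^40 takes 41 bits.
		 * This matches the 2^40 - 1 limit clamped in
		 * ud_wrip() below.
		 */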
        } else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
                /* nanosecond timestamp resolution */
                *valp = 1L;
        } else {
                error = fs_pathconf(vp, cmd, valp, cr, ct);
        }

        return (error);
}

uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
#ifndef __lint
_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
_NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
#endif
/*
 * Assumption is that there will not be a pageio request
 * to an embedded file
 */
/* ARGSUSED */
static int32_t
udf_pageio(
        struct vnode *vp,
        struct page *pp,
        u_offset_t io_off,
        size_t io_len,
        int32_t flags,
        struct cred *cr,
        caller_context_t *ct)
{
        daddr_t bn;
        struct buf *bp;
        struct ud_inode *ip = VTOI(vp);
        int32_t dolock, error = 0, contig, multi_io;
        size_t done_len = 0, cur_len = 0;
        page_t *npp = NULL, *opp = NULL, *cpp = pp;

        if (pp == NULL) {
                return (EINVAL);
        }

        dolock = (rw_owner(&ip->i_contents) != curthread);

        /*
         * We need a better check.  Ideally, we would use another
         * vnodeops so that hlocked and forcibly unmounted file
         * systems would return EIO where appropriate and w/o the
         * need for these checks.
         */
        if (ip->i_udf == NULL) {
                return (EIO);
        }

#ifdef  __lock_lint
        rw_enter(&ip->i_contents, RW_READER);
#else
        if (dolock) {
                rw_enter(&ip->i_contents, RW_READER);
        }
#endif

        /*
         * Break the io request into chunks, one for each contiguous
         * stretch of disk blocks in the target file.
         */
        while (done_len < io_len) {
		ASSERT(cpp);
		bp = NULL;
		contig = 0;
		multi_io = 0;	/* only set when udf_lbsize < PAGESIZE */
                if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
                    &bn, &contig)) {
                        break;
                }

                if (bn == UDF_HOLE) {   /* No holey swapfiles */
                        cmn_err(CE_WARN, "SWAP file has HOLES");
                        error = EINVAL;
                        break;
                }

                cur_len = MIN(io_len - done_len, contig);

                /*
                 * Check if more than one I/O is
                 * required to complete the given
                 * I/O operation
                 */
                if (ip->i_udf->udf_lbsize < PAGESIZE) {
                        if (cur_len >= PAGESIZE) {
                                multi_io = 0;
                                cur_len &= PAGEMASK;
                        } else {
                                multi_io = 1;
                                cur_len = MIN(io_len - done_len, PAGESIZE);
                        }
                }
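		/*
		 * Example of the split above (illustrative numbers,
		 * assuming udf_lbsize = 2K and PAGESIZE = 8K): if bmap
		 * says only 4K is contiguous (contig = 4K), cur_len
		 * stays below PAGESIZE, so multi_io is set and
		 * ud_multi_strat() issues one cloned buf per
		 * contiguous extent; if contig >= 8K, cur_len is
		 * rounded down to a page multiple and a single
		 * bdev_strategy() call suffices.
		 */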
                page_list_break(&cpp, &npp, btop(cur_len));

                bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
                ASSERT(bp != NULL);

                bp->b_edev = ip->i_dev;
                bp->b_dev = cmpdev(ip->i_dev);
                bp->b_blkno = bn;
                bp->b_un.b_addr = (caddr_t)0;
                bp->b_file = vp;
                bp->b_offset = (offset_t)(io_off + done_len);

/*
 *              ub.ub_pageios.value.ul++;
 */
                if (multi_io == 0) {
                        (void) bdev_strategy(bp);
                } else {
                        error = ud_multi_strat(ip, cpp, bp,
                            (u_offset_t)(io_off + done_len));
                        if (error != 0) {
                                pageio_done(bp);
                                break;
                        }
                }
                if (flags & B_READ) {
                        ud_pageio_reads++;
                } else {
                        ud_pageio_writes++;
                }

                /*
                 * If the request is not B_ASYNC, wait for i/o to complete
                 * and re-assemble the page list to return to the caller.
                 * If it is B_ASYNC we leave the page list in pieces and
                 * cleanup() will dispose of them.
                 */
                if ((flags & B_ASYNC) == 0) {
                        error = biowait(bp);
                        pageio_done(bp);
                        if (error) {
                                break;
                        }
                        page_list_concat(&opp, &cpp);
                }
                cpp = npp;
                npp = NULL;
                done_len += cur_len;
        }

        ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
        if (error) {
                if (flags & B_ASYNC) {
                        /* Cleanup unprocessed parts of list */
                        page_list_concat(&cpp, &npp);
                        if (flags & B_READ) {
                                pvn_read_done(cpp, B_ERROR);
                        } else {
                                pvn_write_done(cpp, B_ERROR);
                        }
                } else {
                        /* Re-assemble list and let caller clean up */
                        page_list_concat(&opp, &cpp);
                        page_list_concat(&opp, &npp);
                }
        }

#ifdef  __lock_lint
        rw_exit(&ip->i_contents);
#else
        if (dolock) {
                rw_exit(&ip->i_contents);
        }
#endif
        return (error);
}




/* -------------------- local functions --------------------------- */



int32_t
ud_rdwri(enum uio_rw rw, int32_t ioflag, struct ud_inode *ip, caddr_t base,
    int32_t len, offset_t offset, enum uio_seg seg, int32_t *aresid,
    struct cred *cr)
{
        int32_t error;
        struct uio auio;
        struct iovec aiov;

        ud_printf("ud_rdwri\n");

        bzero((caddr_t)&auio, sizeof (uio_t));
        bzero((caddr_t)&aiov, sizeof (iovec_t));

        aiov.iov_base = base;
        aiov.iov_len = len;
        auio.uio_iov = &aiov;
        auio.uio_iovcnt = 1;
        auio.uio_loffset = offset;
        auio.uio_segflg = (int16_t)seg;
        auio.uio_resid = len;

        if (rw == UIO_WRITE) {
                auio.uio_fmode = FWRITE;
                auio.uio_extflg = UIO_COPY_DEFAULT;
                auio.uio_llimit = curproc->p_fsz_ctl;
                error = ud_wrip(ip, &auio, ioflag, cr);
        } else {
                auio.uio_fmode = FREAD;
                auio.uio_extflg = UIO_COPY_CACHED;
                auio.uio_llimit = MAXOFFSET_T;
                error = ud_rdip(ip, &auio, ioflag, cr);
        }

        if (aresid) {
                *aresid = auio.uio_resid;
        } else if (auio.uio_resid) {
                error = EIO;
        }
        return (error);
}
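
/*
 * A minimal usage sketch for ud_rdwri() (illustrative only; "dip"
 * is a hypothetical inode pointer).  Reading a directory block
 * into a kernel buffer might look like:
 *
 *	char buf[1024];
 *	int32_t err, resid;
 *
 *	err = ud_rdwri(UIO_READ, 0, dip, buf, sizeof (buf),
 *	    (offset_t)0, UIO_SYSSPACE, &resid, CRED());
 *
 * With aresid == NULL a short transfer is turned into EIO, which
 * is the behavior callers that require the full length rely on.
 */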

/*
 * Free behind hacks.  The pager is busted.
 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
 * or B_FREE_IF_TIGHT_ON_MEMORY.
 */
int32_t ud_freebehind = 1;
int32_t ud_smallfile = 32 * 1024;

/* ARGSUSED */
int32_t
ud_getpage_miss(struct vnode *vp, u_offset_t off, size_t len, struct seg *seg,
    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw, int32_t seq)
{
        struct ud_inode *ip = VTOI(vp);
        int32_t err = 0;
        size_t io_len;
        u_offset_t io_off;
        u_offset_t pgoff;
        page_t *pp;

        pl[0] = NULL;

        /*
         * Figure out whether the page can be created, or must be
         * read from the disk
         */
        if (rw == S_CREATE) {
                if ((pp = page_create_va(vp, off,
                    PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
                        cmn_err(CE_WARN, "ud_getpage_miss: page_create");
                        return (EINVAL);
                }
                io_len = PAGESIZE;
        } else {
                pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
                    &io_len, off, PAGESIZE, 0);

                /*
                 * Some other thread has entered the page.
                 * ud_getpage will retry page_lookup.
                 */
                if (pp == NULL) {
                        return (0);
                }

                /*
                 * Fill the page with as much data as we can from the file.
                 */
                err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
                if (err) {
                        pvn_read_done(pp, B_ERROR);
                        return (err);
                }

                /*
                 * XXX ??? ufs has io_len instead of pgoff below
                 */
                ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
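		/*
		 * Example of the rounding above (a sketch, assuming
		 * 8K pages, so PAGEMASK = ~0x1FFF): pgoff = 0x1400
		 * gives ((0x1400 + 0x1FFF) & ~0x1FFF) = 0x2000, so
		 * i_nextrio advances to the next page boundary past
		 * the data actually filled.
		 */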

                /*
                 * If the file access is sequential, initiate read ahead
                 * of the next cluster.
                 */
                if (seq && ip->i_nextrio < ip->i_size) {
                        ud_getpage_ra(vp, off, seg, addr);
                }
        }

        pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
        return (err);
}

/* ARGSUSED */
void
ud_getpage_ra(struct vnode *vp, u_offset_t off, struct seg *seg, caddr_t addr)
{
        page_t *pp;
        size_t io_len;
        struct ud_inode *ip = VTOI(vp);
        u_offset_t io_off = ip->i_nextrio, pgoff;
        caddr_t addr2 = addr + (io_off - off);
        daddr_t bn;
        int32_t contig = 0;

        /*
         * Is this test needed?
         */

        if (addr2 >= seg->s_base + seg->s_size) {
                return;
        }

        contig = 0;
        if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
                return;
        }

        pp = pvn_read_kluster(vp, io_off, seg, addr2,
            &io_off, &io_len, io_off, PAGESIZE, 1);

	/*
	 * Some other thread has entered the page, so no read
	 * ahead is done here (i.e. we will have to wait for the
	 * read when the page is actually needed).
	 */

        if (pp == NULL) {
                return;
        }

        (void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
	ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
}

int
ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off, uint32_t bflgs,
    u_offset_t *pg_off)
{
        daddr_t bn;
        struct buf *bp;
        caddr_t kaddr, caddr;
        int32_t error = 0, contig = 0, multi_io = 0;
        int32_t lbsize = ip->i_udf->udf_lbsize;
        int32_t lbmask = ip->i_udf->udf_lbmask;
        uint64_t isize;

        isize = (ip->i_size + lbmask) & (~lbmask);
        if (ip->i_desc_type == ICB_FLAG_ONE_AD) {

		/*
		 * Embedded file: read the file_entry from the
		 * buffer cache and copy out the required portion.
		 */
                bp = ud_bread(ip->i_dev,
                    ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
                if ((bp->b_error == 0) &&
                    (bp->b_resid == 0)) {

                        caddr = bp->b_un.b_addr + ip->i_data_off;

                        /*
                         * mapin to kvm
                         */
                        kaddr = (caddr_t)ppmapin(pp,
                            PROT_READ | PROT_WRITE, (caddr_t)-1);
                        (void) kcopy(caddr, kaddr, ip->i_size);

                        /*
                         * mapout of kvm
                         */
                        ppmapout(kaddr);
                }
                brelse(bp);
                contig = ip->i_size;
        } else {

                /*
                 * Get the continuous size and block number
                 * at offset "off"
                 */
                if (error = ud_bmap_read(ip, off, &bn, &contig))
                        goto out;
                contig = MIN(contig, PAGESIZE);
                contig = (contig + lbmask) & (~lbmask);
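		/*
		 * Example of the rounding above (a sketch, assuming a
		 * 2K logical block, so lbmask = 0x7FF): contig = 0x900
		 * becomes (0x900 + 0x7FF) & ~0x7FF = 0x1000, i.e. it
		 * is rounded up to a whole logical block so the device
		 * I/O stays block aligned.
		 */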

                /*
                 * Zero part of the page which we are not
                 * going to read from the disk.
                 */

                if (bn == UDF_HOLE) {

                        /*
                         * This is a HOLE. Just zero out
                         * the page
                         */
                        if (((off + contig) == isize) ||
                            (contig == PAGESIZE)) {
                                pagezero(pp->p_prev, 0, PAGESIZE);
                                goto out;
                        }
                }

                if (contig < PAGESIZE) {
                        uint64_t count;

                        count = isize - off;
                        if (contig != count) {
                                multi_io = 1;
                                contig = (int32_t)(MIN(count, PAGESIZE));
                        } else {
                                pagezero(pp->p_prev, contig, PAGESIZE - contig);
                        }
                }

                /*
                 * Get a bp and initialize it
                 */
                bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
                ASSERT(bp != NULL);

                bp->b_edev = ip->i_dev;
                bp->b_dev = cmpdev(ip->i_dev);
                bp->b_blkno = bn;
                bp->b_un.b_addr = 0;
                bp->b_file = ip->i_vnode;

                /*
                 * Start I/O
                 */
                if (multi_io == 0) {

                        /*
                         * Single I/O is sufficient for this page
                         */
                        (void) bdev_strategy(bp);
                } else {

			/*
			 * We need to do the I/O in pieces
			 */
                        error = ud_multi_strat(ip, pp, bp, off);
                        if (error != 0) {
                                goto out;
                        }
                }
                if ((bflgs & B_ASYNC) == 0) {

                        /*
                         * Wait for i/o to complete.
                         */

                        error = biowait(bp);
                        pageio_done(bp);
                        if (error) {
                                goto out;
                        }
                }
        }
        if ((off + contig) >= ip->i_size) {
                contig = ip->i_size - off;
        }

out:
        *pg_off = contig;
        return (error);
}

int32_t
ud_putpages(struct vnode *vp, offset_t off, size_t len, int32_t flags,
    struct cred *cr)
{
        struct ud_inode *ip;
        page_t *pp;
        u_offset_t io_off;
        size_t io_len;
        u_offset_t eoff;
        int32_t err = 0;
        int32_t dolock;

        ud_printf("ud_putpages\n");

        if (vp->v_count == 0) {
                cmn_err(CE_WARN, "ud_putpages: bad v_count");
                return (EINVAL);
        }

        ip = VTOI(vp);

        /*
         * Acquire the readers/write inode lock before locking
         * any pages in this inode.
         * The inode lock is held during i/o.
         */
        if (len == 0) {
                mutex_enter(&ip->i_tlock);
                ip->i_delayoff = ip->i_delaylen = 0;
                mutex_exit(&ip->i_tlock);
        }
#ifdef  __lock_lint
        rw_enter(&ip->i_contents, RW_READER);
#else
        dolock = (rw_owner(&ip->i_contents) != curthread);
        if (dolock) {
                rw_enter(&ip->i_contents, RW_READER);
        }
#endif

        if (!vn_has_cached_data(vp)) {
#ifdef  __lock_lint
                rw_exit(&ip->i_contents);
#else
                if (dolock) {
                        rw_exit(&ip->i_contents);
                }
#endif
                return (0);
        }

        if (len == 0) {
                /*
                 * Search the entire vp list for pages >= off.
                 */
                err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
                    flags, cr);
        } else {
                /*
                 * Loop over all offsets in the range looking for
                 * pages to deal with.
                 */
                if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
                        eoff = MIN(off + len, eoff);
                } else {
                        eoff = off + len;
                }

                for (io_off = off; io_off < eoff; io_off += io_len) {
                        /*
                         * If we are not invalidating, synchronously
                         * freeing or writing pages, use the routine
                         * page_lookup_nowait() to prevent reclaiming
                         * them from the free list.
                         */
                        if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
                                pp = page_lookup(vp, io_off,
                                    (flags & (B_INVAL | B_FREE)) ?
                                    SE_EXCL : SE_SHARED);
                        } else {
                                pp = page_lookup_nowait(vp, io_off,
                                    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
                        }

                        if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
                                io_len = PAGESIZE;
                        } else {

                                err = ud_putapage(vp, pp,
                                    &io_off, &io_len, flags, cr);
                                if (err != 0) {
                                        break;
                                }
                                /*
                                 * "io_off" and "io_len" are returned as
                                 * the range of pages we actually wrote.
                                 * This allows us to skip ahead more quickly
                                 * since several pages may've been dealt
                                 * with by this iteration of the loop.
                                 */
                        }
                }
        }
        if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
                /*
                 * We have just sync'ed back all the pages on
                 * the inode, turn off the IMODTIME flag.
                 */
                mutex_enter(&ip->i_tlock);
                ip->i_flag &= ~IMODTIME;
                mutex_exit(&ip->i_tlock);
        }
#ifdef  __lock_lint
        rw_exit(&ip->i_contents);
#else
        if (dolock) {
                rw_exit(&ip->i_contents);
        }
#endif
        return (err);
}

/* ARGSUSED */
int32_t
ud_putapage(struct vnode *vp, page_t *pp, u_offset_t *offp,
    size_t *lenp, int32_t flags, struct cred *cr)
{
        daddr_t bn;
        size_t io_len;
        struct ud_inode *ip;
        int32_t error = 0, contig, multi_io = 0;
        struct udf_vfs *udf_vfsp;
        u_offset_t off, io_off;
        caddr_t kaddr, caddr;
        struct buf *bp = NULL;
        int32_t lbmask;
        uint64_t isize;
        uint16_t crc_len;
        struct file_entry *fe;

        ud_printf("ud_putapage\n");

        ip = VTOI(vp);
        ASSERT(ip);
        ASSERT(RW_LOCK_HELD(&ip->i_contents));
        lbmask = ip->i_udf->udf_lbmask;
        isize = (ip->i_size + lbmask) & (~lbmask);

        udf_vfsp = ip->i_udf;
        ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);

        /*
         * If the modified time on the inode has not already been
         * set elsewhere (e.g. for write/setattr) we set the time now.
         * This gives us approximate modified times for mmap'ed files
         * which are modified via stores in the user address space.
         */
        if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
                mutex_enter(&ip->i_tlock);
                ip->i_flag |= IUPD;
                ITIMES_NOLOCK(ip);
                mutex_exit(&ip->i_tlock);
        }


	/*
	 * Align the request to a block boundary (for old file systems),
	 * and go ask bmap() how contiguous things are for this file.
	 */
	off = pp->p_offset & ~(offset_t)lbmask;	/* block align it */


        if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
                ASSERT(ip->i_size <= ip->i_max_emb);

                pp = pvn_write_kluster(vp, pp, &io_off,
                    &io_len, off, PAGESIZE, flags);
                if (io_len == 0) {
                        io_len = PAGESIZE;
                }

                bp = ud_bread(ip->i_dev,
                    ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
                    udf_vfsp->udf_lbsize);
                fe = (struct file_entry *)bp->b_un.b_addr;
                if ((bp->b_flags & B_ERROR) ||
                    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
                    ip->i_icb_block,
                    1, udf_vfsp->udf_lbsize) != 0)) {
                        if (pp != NULL)
                                pvn_write_done(pp, B_ERROR | B_WRITE | flags);
                        if (bp->b_flags & B_ERROR) {
                                error = EIO;
                        } else {
                                error = EINVAL;
                        }
                        brelse(bp);
                        return (error);
                }
                if ((bp->b_error == 0) &&
                    (bp->b_resid == 0)) {

                        caddr = bp->b_un.b_addr + ip->i_data_off;
                        kaddr = (caddr_t)ppmapin(pp,
                            PROT_READ | PROT_WRITE, (caddr_t)-1);
                        (void) kcopy(kaddr, caddr, ip->i_size);
                        ppmapout(kaddr);
                }
                crc_len = offsetof(struct file_entry, fe_spec) +
                    SWAP_32(fe->fe_len_ear);
                crc_len += ip->i_size;
                ud_make_tag(ip->i_udf, &fe->fe_tag,
                    UD_FILE_ENTRY, ip->i_icb_block, crc_len);

                bwrite(bp);

                if (flags & B_ASYNC) {
                        pvn_write_done(pp, flags);
                }
                contig = ip->i_size;
        } else {

                if (error = ud_bmap_read(ip, off, &bn, &contig)) {
                        goto out;
                }
                contig = MIN(contig, PAGESIZE);
                contig = (contig + lbmask) & (~lbmask);

                if (contig < PAGESIZE) {
                        uint64_t count;

                        count = isize - off;
                        if (contig != count) {
                                multi_io = 1;
                                contig = (int32_t)(MIN(count, PAGESIZE));
                        }
                }

                if ((off + contig) > isize) {
                        contig = isize - off;
                }

                if (contig > PAGESIZE) {
                        if (contig & PAGEOFFSET) {
                                contig &= PAGEMASK;
                        }
                }

                pp = pvn_write_kluster(vp, pp, &io_off,
                    &io_len, off, contig, flags);
                if (io_len == 0) {
                        io_len = PAGESIZE;
                }

                bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
                ASSERT(bp != NULL);

                bp->b_edev = ip->i_dev;
                bp->b_dev = cmpdev(ip->i_dev);
                bp->b_blkno = bn;
                bp->b_un.b_addr = 0;
                bp->b_file = vp;
                bp->b_offset = (offset_t)off;


                /*
                 * write throttle
                 */
                ASSERT(bp->b_iodone == NULL);
                bp->b_iodone = ud_iodone;
                mutex_enter(&ip->i_tlock);
                ip->i_writes += bp->b_bcount;
                mutex_exit(&ip->i_tlock);

                if (multi_io == 0) {

                        (void) bdev_strategy(bp);
                } else {
                        error = ud_multi_strat(ip, pp, bp, off);
                        if (error != 0) {
                                goto out;
                        }
                }

                if ((flags & B_ASYNC) == 0) {
                        /*
                         * Wait for i/o to complete.
                         */
                        error = biowait(bp);
                        pageio_done(bp);
                }
        }

        if ((flags & B_ASYNC) == 0) {
                pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
        }

        pp = NULL;

out:
        if (error != 0 && pp != NULL) {
                pvn_write_done(pp, B_ERROR | B_WRITE | flags);
        }

        if (offp) {
                *offp = io_off;
        }
        if (lenp) {
                *lenp = io_len;
        }

        return (error);
}


int32_t
ud_iodone(struct buf *bp)
{
        struct ud_inode *ip;

        ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));

        bp->b_iodone = NULL;

        ip = VTOI(bp->b_pages->p_vnode);

        mutex_enter(&ip->i_tlock);
        if (ip->i_writes >= ud_LW) {
                if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
                        if (ud_WRITES) {
                                cv_broadcast(&ip->i_wrcv); /* wake all up */
                        }
                }
        } else {
                ip->i_writes -= bp->b_bcount;
        }
        mutex_exit(&ip->i_tlock);
        iodone(bp);
        return (0);
}
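
/*
 * Note on the watermark scheme above: i_writes counts bytes of
 * write I/O in flight.  ud_iodone() broadcasts on i_wrcv only as
 * the count drops back through the low watermark (ud_LW), pairing
 * with the corresponding high-watermark throttle in the write
 * path.  cv_broadcast() is cheap relative to the I/O itself, so
 * waking all waiters rather than one keeps the logic simple.
 */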

/* ARGSUSED3 */
int32_t
ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
{
        struct vnode *vp;
        struct udf_vfs *udf_vfsp;
        krw_t rwtype;
        caddr_t base;
        uint32_t flags;
        int32_t error, n, on, mapon, dofree;
        u_offset_t off;
        long oresid = uio->uio_resid;

        ASSERT(RW_LOCK_HELD(&ip->i_contents));
        if ((ip->i_type != VREG) &&
            (ip->i_type != VDIR) &&
            (ip->i_type != VLNK)) {
                return (EIO);
        }

        if (uio->uio_loffset > MAXOFFSET_T) {
                return (0);
        }

        if ((uio->uio_loffset < (offset_t)0) ||
            ((uio->uio_loffset + uio->uio_resid) < 0)) {
                return (EINVAL);
        }
        if (uio->uio_resid == 0) {
                return (0);
        }

        vp = ITOV(ip);
        udf_vfsp = ip->i_udf;
        mutex_enter(&ip->i_tlock);
        ip->i_flag |= IACC;
        mutex_exit(&ip->i_tlock);

	rwtype = (rw_write_held(&ip->i_contents) ? RW_WRITER : RW_READER);

        do {
                offset_t diff;
                u_offset_t uoff = uio->uio_loffset;
                off = uoff & (offset_t)MAXBMASK;
                mapon = (int)(uoff & (offset_t)MAXBOFFSET);
                on = (int)blkoff(udf_vfsp, uoff);
                n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
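		/*
		 * Worked example of the decomposition above (a sketch,
		 * assuming MAXBSIZE = 8K and a 2K logical block): for
		 * uoff = 0x2900, off = 0x2000 (segmap window base),
		 * mapon = 0x900 (offset within the window), on = 0x100
		 * (offset within the logical block), and n is capped
		 * so one iteration never crosses a logical block
		 * boundary.
		 */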

                diff = ip->i_size - uoff;

                if (diff <= (offset_t)0) {
                        error = 0;
                        goto out;
                }
                if (diff < (offset_t)n) {
                        n = (int)diff;
                }
                dofree = ud_freebehind &&
                    ip->i_nextr == (off & PAGEMASK) &&
                    off > ud_smallfile;

#ifndef __lock_lint
                if (rwtype == RW_READER) {
                        rw_exit(&ip->i_contents);
                }
#endif

                base = segmap_getmapflt(segkmap, vp, (off + mapon),
                    (uint32_t)n, 1, S_READ);
                error = uiomove(base + mapon, (long)n, UIO_READ, uio);

                flags = 0;
                if (!error) {
			/*
			 * If we read a whole block, or read to EOF,
			 * we won't need this buffer again soon.
			 */
                        if (n + on == MAXBSIZE && ud_freebehind && dofree &&
                            freemem < lotsfree + pages_before_pager) {
				flags = SM_FREE | SM_DONTNEED | SM_ASYNC;
                        }
                        /*
                         * In POSIX SYNC (FSYNC and FDSYNC) read mode,
                         * we want to make sure that the page which has
                         * been read, is written on disk if it is dirty.
                         * And corresponding indirect blocks should also
                         * be flushed out.
                         */
                        if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
                                flags &= ~SM_ASYNC;
                                flags |= SM_WRITE;
                        }
                        error = segmap_release(segkmap, base, flags);
                } else    {
                        (void) segmap_release(segkmap, base, flags);
                }

#ifndef __lock_lint
                if (rwtype == RW_READER) {
                        rw_enter(&ip->i_contents, rwtype);
                }
#endif
        } while (error == 0 && uio->uio_resid > 0 && n != 0);
out:
        /*
         * Inode is updated according to this table if FRSYNC is set.
         *
         *      FSYNC   FDSYNC(posix.4)
         *      --------------------------
         *      always  IATTCHG|IBDWRITE
         */
	if (ioflag & FRSYNC) {
		if ((ioflag & FSYNC) ||
		    ((ioflag & FDSYNC) &&
		    (ip->i_flag & (IATTCHG|IBDWRITE)))) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ud_iupdat(ip, 1);
		}
	}
        /*
         * If we've already done a partial read, terminate
         * the read but return no error.
         */
        if (oresid != uio->uio_resid) {
                error = 0;
        }
        ITIMES(ip);

        return (error);
}

int32_t
ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
        caddr_t base;
        struct vnode *vp;
        struct udf_vfs *udf_vfsp;
        uint32_t flags;
	int32_t error = 0, iupdat_flag = 0, n, on, mapon, i_size_changed = 0;
        int32_t pagecreate, newpage;
        uint64_t old_i_size;
        u_offset_t off;
        long start_resid = uio->uio_resid, premove_resid;
        rlim64_t limit = uio->uio_limit;


        ASSERT(RW_WRITE_HELD(&ip->i_contents));
        if ((ip->i_type != VREG) &&
            (ip->i_type != VDIR) &&
            (ip->i_type != VLNK)) {
                return (EIO);
        }

        if (uio->uio_loffset >= MAXOFFSET_T) {
                return (EFBIG);
        }
        /*
         * see udf_l_pathconf
         */
        if (limit > (((uint64_t)1 << 40) - 1)) {
                limit = ((uint64_t)1 << 40) - 1;
        }
        if (uio->uio_loffset >= limit) {
                proc_t *p = ttoproc(curthread);

                mutex_enter(&p->p_lock);
                (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
                    p, RCA_UNSAFE_SIGINFO);
                mutex_exit(&p->p_lock);
                return (EFBIG);
        }
        if ((uio->uio_loffset < (offset_t)0) ||
            ((uio->uio_loffset + uio->uio_resid) < 0)) {
                return (EINVAL);
        }
        if (uio->uio_resid == 0) {
                return (0);
        }

        mutex_enter(&ip->i_tlock);
        ip->i_flag |= INOACC;

        if (ioflag & (FSYNC | FDSYNC)) {
                ip->i_flag |= ISYNC;
                iupdat_flag = 1;
        }
        mutex_exit(&ip->i_tlock);

        udf_vfsp = ip->i_udf;
        vp = ITOV(ip);

        do {
                u_offset_t uoff = uio->uio_loffset;
                off = uoff & (offset_t)MAXBMASK;
                mapon = (int)(uoff & (offset_t)MAXBOFFSET);
                on = (int)blkoff(udf_vfsp, uoff);
                n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

                if (ip->i_type == VREG && uoff + n >= limit) {
                        if (uoff >= limit) {
                                error = EFBIG;
                                goto out;
                        }
                        n = (int)(limit - (rlim64_t)uoff);
                }
                if (uoff + n > ip->i_size) {
                        /*
                         * We are extending the length of the file.
                         * bmap is used so that we are sure that
                         * if we need to allocate new blocks, that it
                         * is done here before we up the file size.
                         */
                        error = ud_bmap_write(ip, uoff,
                            (int)(on + n), mapon == 0, cr);
                        if (error) {
                                break;
                        }
                        i_size_changed = 1;
                        old_i_size = ip->i_size;
                        ip->i_size = uoff + n;
                        /*
                         * If we are writing from the beginning of
                         * the mapping, we can just create the
                         * pages without having to read them.
                         */
                        pagecreate = (mapon == 0);
                } else if (n == MAXBSIZE) {
			/*
			 * Going to do a whole mapping's worth,
			 * so we can just create the pages w/o
			 * having to read them in.  But before
			 * we do that, we need to make sure any
			 * needed blocks are allocated first.
			 */
                        error = ud_bmap_write(ip, uoff,
                            (int)(on + n), 1, cr);
                        if (error) {
                                break;
                        }
                        pagecreate = 1;
                } else {
                        pagecreate = 0;
                }

                rw_exit(&ip->i_contents);

                /*
                 * Touch the page and fault it in if it is not in
                 * core before segmap_getmapflt can lock it. This
                 * is to avoid the deadlock if the buffer is mapped
                 * to the same file through mmap which we want to
                 * write to.
                 */
                uio_prefaultpages((long)n, uio);

                base = segmap_getmapflt(segkmap, vp, (off + mapon),
                    (uint32_t)n, !pagecreate, S_WRITE);

                /*
                 * segmap_pagecreate() returns 1 if it calls
                 * page_create_va() to allocate any pages.
                 */
                newpage = 0;
                if (pagecreate) {
                        newpage = segmap_pagecreate(segkmap, base,
                            (size_t)n, 0);
                }

                premove_resid = uio->uio_resid;
                error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);

                if (pagecreate &&
                    uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
                        /*
                         * We created pages w/o initializing them completely,
                         * thus we need to zero the part that wasn't set up.
                         * This happens on most EOF write cases and if
                         * we had some sort of error during the uiomove.
                         */
                        int nzero, nmoved;

                        nmoved = (int)(uio->uio_loffset - (off + mapon));
                        ASSERT(nmoved >= 0 && nmoved <= n);
                        nzero = roundup(on + n, PAGESIZE) - nmoved;
                        ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
                        (void) kzero(base + mapon + nmoved, (uint32_t)nzero);
                }

                /*
                 * Unlock the pages allocated by page_create_va()
                 * in segmap_pagecreate()
                 */
                if (newpage) {
                        segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
                }

                if (error) {
                        /*
                         * If we failed on a write, we may have already
                         * allocated file blocks as well as pages.  It's
                         * hard to undo the block allocation, but we must
                         * be sure to invalidate any pages that may have
                         * been allocated.
                         */
                        (void) segmap_release(segkmap, base, SM_INVAL);
                } else {
                        flags = 0;
                        /*
                         * Force write back for synchronous write cases.
                         */
                        if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
                                /*
                                 * If the sticky bit is set but the
                                 * execute bit is not set, we do a
                                 * synchronous write back and free
                                 * the page when done.  We set up swap
                                 * files to be handled this way to
                                 * prevent servers from keeping around
                                 * the client's swap pages too long.
                                 * XXX - there ought to be a better way.
                                 */
                                if (IS_SWAPVP(vp)) {
                                        flags = SM_WRITE | SM_FREE |
                                            SM_DONTNEED;
                                        iupdat_flag = 0;
                                } else {
                                        flags = SM_WRITE;
                                }
                        } else if (((mapon + n) == MAXBSIZE) ||
                            IS_SWAPVP(vp)) {
                                /*
                                 * Have written a whole block.
                                 * Start an asynchronous write and
                                 * mark the buffer to indicate that
                                 * it won't be needed again soon.
                                 */
				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
                        }
                        error = segmap_release(segkmap, base, flags);

                        /*
                         * If the operation failed and is synchronous,
                         * then we need to unwind what uiomove() last
                         * did so we can potentially return an error to
                         * the caller.  If this write operation was
                         * done in two pieces and the first succeeded,
                         * then we won't return an error for the second
                         * piece that failed.  However, we only want to
                         * return a resid value that reflects what was
                         * really done.
                         *
                         * Failures for non-synchronous operations can
                         * be ignored since the page subsystem will
                         * retry the operation until it succeeds or the
                         * file system is unmounted.
                         */
                        if (error) {
                                if ((ioflag & (FSYNC | FDSYNC)) ||
                                    ip->i_type == VDIR) {
                                        uio->uio_resid = premove_resid;
                                } else {
                                        error = 0;
                                }
                        }
                }

                /*
                 * Re-acquire contents lock.
                 */
                rw_enter(&ip->i_contents, RW_WRITER);
                /*
                 * If the uiomove() failed or if a synchronous
                 * page push failed, fix up i_size.
                 */
                if (error) {
                        if (i_size_changed) {
				/*
				 * The uiomove failed, and we
				 * allocated blocks, so get rid
				 * of them.
				 */
                                (void) ud_itrunc(ip, old_i_size, 0, cr);
                        }
                } else {
                        /*
                         * XXX - Can this be out of the loop?
                         */
                        ip->i_flag |= IUPD | ICHG;
                        if (i_size_changed) {
                                ip->i_flag |= IATTCHG;
                        }
                        if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
                            (IEXEC >> 10))) != 0 &&
                            (ip->i_char & (ISUID | ISGID)) != 0 &&
                            secpolicy_vnode_setid_retain(cr,
                            (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
				/*
				 * Clear Set-UID & Set-GID bits on
				 * successful write if not privileged
				 * and at least one of the execute bits
				 * is set.  If we always cleared Set-GID,
				 * mandatory file and record locking
				 * would be unusable.
				 */
                                ip->i_char &= ~(ISUID | ISGID);
                        }
                }
        } while (error == 0 && uio->uio_resid > 0 && n != 0);

out:
        /*
         * Inode is updated according to this table -
         *
         *      FSYNC   FDSYNC(posix.4)
         *      --------------------------
         *      always@ IATTCHG|IBDWRITE
         *
         * @ -  If we are doing synchronous write the only time we should
         *      not be sync'ing the ip here is if we have the stickyhack
         *      activated, the file is marked with the sticky bit and
         *      no exec bit, the file length has not been changed and
         *      no new blocks have been allocated during this write.
         */
        if ((ip->i_flag & ISYNC) != 0) {
                /*
                 * we have eliminated nosync
                 */
                if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
                    ((ioflag & FSYNC) && iupdat_flag)) {
                        ud_iupdat(ip, 1);
                }
        }

        /*
         * If we've already done a partial-write, terminate
         * the write but return no error.
         */
        if (start_resid != uio->uio_resid) {
                error = 0;
        }
        ip->i_flag &= ~(INOACC | ISYNC);
        ITIMES_NOLOCK(ip);

        return (error);
}

int32_t
ud_multi_strat(struct ud_inode *ip,
    page_t *pp, struct buf *bp, u_offset_t start)
{
        daddr_t bn;
        int32_t error = 0, io_count, contig, alloc_sz, i;
        uint32_t io_off;
        mio_master_t *mm = NULL;
        mio_slave_t *ms = NULL;
        struct buf *rbp;

        ASSERT(!(start & PAGEOFFSET));

        /*
         * Figure out how many buffers to allocate
         */
        io_count = 0;
        for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
                contig = 0;
                if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
                    &bn, &contig)) {
                        goto end;
                }
                if (contig == 0) {
                        goto end;
                }
                contig = MIN(contig, PAGESIZE - io_off);
                if (bn != UDF_HOLE) {
			io_count++;
                } else {
                        /*
                         * HOLE
                         */
                        if (bp->b_flags & B_READ) {

				/*
				 * This is a hole; on a read it
				 * should be filled with zeroes.
				 */
                                pagezero(pp, io_off, contig);
                        }
                }
        }


        if (io_count != 0) {

		/*
		 * Allocate memory for the master and the required
		 * number of slave buffers in one chunk; KM_SLEEP
		 * allocations cannot fail, so no NULL check is needed.
		 */
		alloc_sz = sizeof (mio_master_t) +
		    (sizeof (mio_slave_t) * io_count);
		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);

                /*
                 * initialize master
                 */
                mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
                mm->mm_size = alloc_sz;
                mm->mm_bp = bp;
                mm->mm_resid = 0;
                mm->mm_error = 0;
                mm->mm_index = master_index++;

                ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
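		/*
		 * Memory layout of the single allocation above
		 * (io_count slaves immediately follow the master):
		 *
		 *	+--------------+----------+----------+-- --+
		 *	| mio_master_t | slave[0] | slave[1] | ... |
		 *	+--------------+----------+----------+-- --+
		 *	^mm            ^ms
		 *
		 * Each slave embeds its own buf (ms_buf) cloned from
		 * the caller's bp, and ms_ptr points back at the
		 * master so ud_slave_done() can update the shared
		 * residual count under mm_mutex.
		 */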

                /*
                 * Initialize buffers
                 */
                io_count = 0;
                for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
                        contig = 0;
                        if (error = ud_bmap_read(ip,
                            (u_offset_t)(start + io_off),
                            &bn, &contig)) {
                                goto end;
                        }
                        ASSERT(contig);
                        if ((io_off + contig) > bp->b_bcount) {
                                contig = bp->b_bcount - io_off;
                        }
                        if (bn != UDF_HOLE) {
                                /*
                                 * Clone the buffer
                                 * and prepare to start I/O
                                 */
                                ms->ms_ptr = mm;
                                bioinit(&ms->ms_buf);
                                rbp = bioclone(bp, io_off, (size_t)contig,
                                    bp->b_edev, bn, ud_slave_done,
                                    &ms->ms_buf, KM_NOSLEEP);
                                ASSERT(rbp == &ms->ms_buf);
                                mm->mm_resid += contig;
                                io_count++;
				ms++;
                        }
                }

                /*
                 * Start I/O's
                 */
                ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
                for (i = 0; i < io_count; i++) {
                        (void) bdev_strategy(&ms->ms_buf);
			ms++;
                }
        }

end:
        if (error != 0) {
                bp->b_flags |= B_ERROR;
                bp->b_error = error;
                if (mm != NULL) {
                        mutex_destroy(&mm->mm_mutex);
                        kmem_free(mm, mm->mm_size);
                }
        }
        return (error);
}

int32_t
ud_slave_done(struct buf *bp)
{
        mio_master_t *mm;
        int32_t resid;

        ASSERT(SEMA_HELD(&bp->b_sem));
        ASSERT((bp->b_flags & B_DONE) == 0);

        mm = ((mio_slave_t *)bp)->ms_ptr;

        /*
         * Propagate error and byte count info from slave struct to
         * the master struct
         */
        mutex_enter(&mm->mm_mutex);
        if (bp->b_flags & B_ERROR) {

		/*
		 * If multiple slave buffers get errors, we keep only
		 * the most recent one; this is fine because we cannot
		 * return more than one error anyway.
		 */
                mm->mm_error = bp->b_error;
        }
        mm->mm_resid -= bp->b_bcount;
        resid = mm->mm_resid;
        mutex_exit(&mm->mm_mutex);

        /*
         * free up the resources allocated to cloned buffers.
         */
        bp_mapout(bp);
        biofini(bp);

        if (resid == 0) {

                /*
                 * This is the last I/O operation
                 * clean up and return the original buffer
                 */
                if (mm->mm_error) {
                        mm->mm_bp->b_flags |= B_ERROR;
                        mm->mm_bp->b_error = mm->mm_error;
                }
                biodone(mm->mm_bp);
                mutex_destroy(&mm->mm_mutex);
                kmem_free(mm, mm->mm_size);
        }
        return (0);
}