/* root/usr/src/uts/common/fs/ufs/ufs_directio.c */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2019 Joyent, Inc.
 */

/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */

/*
 * Portions of this source code were derived from Berkeley 4.3 BSD
 * under license from the Regents of the University of California.
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/dnlc.h>
#include <sys/conf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/filio.h>
#include <sys/atomic.h>

#include <sys/fssnap_if.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_lockfs.h>
#include <sys/fs/ufs_filio.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_fsdir.h>
#include <sys/fs/ufs_quota.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/dirent.h>         /* must be AFTER <sys/fs/fsdir.h>! */
#include <sys/errno.h>

#include <sys/filio.h>          /* _FIOIO */

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_vn.h>
#include <vm/seg_kmem.h>
#include <vm/rm.h>
#include <sys/swap.h>
#include <sys/epm.h>

#include <fs/fs_subr.h>

static void     *ufs_directio_zero_buf;
static int      ufs_directio_zero_len   = 8192;

int     ufs_directio_enabled = 1;       /* feature is enabled */

/*
 * for kstats reader
 */
struct ufs_directio_kstats {
        kstat_named_t   logical_reads;
        kstat_named_t   phys_reads;
        kstat_named_t   hole_reads;
        kstat_named_t   nread;
        kstat_named_t   logical_writes;
        kstat_named_t   phys_writes;
        kstat_named_t   nwritten;
        kstat_named_t   nflushes;
} ufs_directio_kstats = {
        { "logical_reads",      KSTAT_DATA_UINT64 },
        { "phys_reads",         KSTAT_DATA_UINT64 },
        { "hole_reads",         KSTAT_DATA_UINT64 },
        { "nread",              KSTAT_DATA_UINT64 },
        { "logical_writes",     KSTAT_DATA_UINT64 },
        { "phys_writes",        KSTAT_DATA_UINT64 },
        { "nwritten",           KSTAT_DATA_UINT64 },
        { "nflushes",           KSTAT_DATA_UINT64 },
};

kstat_t *ufs_directio_kstatsp;

/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/deconstruction
 * per request
 */
struct directio_buf {
        struct directio_buf     *next;
        char            *addr;
        size_t          nbytes;
        struct buf      buf;
};
static struct kmem_cache *directio_buf_cache;


/* ARGSUSED */
static int
directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
{
        struct directio_buf *dp = dbp;

        /* Construct the embedded buf once per cache object. */
        bioinit(&dp->buf);
        return (0);
}

/* ARGSUSED */
static void
directio_buf_destructor(void *dbp, void *cdrarg)
{
        struct directio_buf *dp = dbp;

        /* Tear down the embedded buf constructed in the constructor. */
        biofini(&dp->buf);
}

/*
 * Create the kmem cache from which per-request directio buf headers
 * are allocated (see the comment above struct directio_buf).
 */
void
directio_bufs_init(void)
{
        directio_buf_cache = kmem_cache_create("directio_buf_cache",
            sizeof (struct directio_buf), 0, directio_buf_constructor,
            directio_buf_destructor, NULL, NULL, NULL, 0);
}

void
ufs_directio_init(void)
{
        /*
         * kstats
         */
        ufs_directio_kstatsp = kstat_create("ufs", 0,
            "directio", "ufs", KSTAT_TYPE_NAMED,
            sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
        if (ufs_directio_kstatsp) {
                ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
                kstat_install(ufs_directio_kstatsp);
        }
        /*
         * kzero is broken so we have to use a private buf of zeroes
         */
        ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
        directio_bufs_init();
}

/*
 * Wait for one direct IO operation to finish, accumulate the byte
 * count into *bytes_iop, and free the directio buf header.
 */
static int
directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
{
        buf_t   *bp = &dbp->buf;
        int     error;

        /*
         * Wait for IO to finish
         */
        error = biowait(bp);

        /*
         * bytes_io will be used to figure out a resid
         * for the caller. The resid is approximated by reporting
         * the bytes following the first failed IO as the residual.
         *
         * b_resid is used cautiously here because it is unclear
         * how well the disk drivers maintain it.
         */
        if (error != 0) {
                if (bp->b_resid != 0)
                        *bytes_iop = bp->b_bcount - bp->b_resid;
                else
                        *bytes_iop = 0;
        } else {
                *bytes_iop += bp->b_bcount;
        }

        /*
         * Release direct IO resources
         */
        bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
        kmem_cache_free(directio_buf_cache, dbp);
        return (error);
}

/*
 * Wait for all of the direct IO operations on the list to finish,
 * returning the first error encountered (if any).
 */
static int
directio_wait(struct directio_buf *tail, long *bytes_iop)
{
        struct directio_buf     *dbp, *next;
        int     error = 0;

        /*
         * The linked list of directio buf structures is maintained
         * in reverse order (tail->last request->penultimate request->...)
         *
         * Capture dbp->next before directio_wait_one() frees dbp.
         */
        for (dbp = tail; dbp != NULL; dbp = next) {
                int newerror;

                next = dbp->next;
                newerror = directio_wait_one(dbp, bytes_iop);
                if (error == 0)
                        error = newerror;
        }
        return (error);
}
/*
 * Initiate direct IO request
 *
 * Builds a directio buf header for the transfer of `nbytes' between
 * `addr' and device offset `offset' on ip's device, prepends it to the
 * caller's list at *tailp, and issues it asynchronously via the snapshot
 * layer or bdev_strategy().  The caller must reap completions with
 * directio_wait().
 *
 *   rw      - S_WRITE for a read(2)-style transfer (device -> user buffer),
 *             S_READ for a write(2)-style transfer; see comment below.
 *   procp   - process for VM operations on the user address space
 *             (NULL for kernel space).
 *   pplist  - shadow page list from as_pagelock(), or NULL.
 */
static void
directio_start(struct ufsvfs *ufsvfsp, struct inode *ip, size_t nbytes,
    offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
    struct directio_buf **tailp, page_t **pplist)
{
        buf_t *bp;
        struct directio_buf *dbp;

        /*
         * Allocate a directio buf header
         *   Note - list is maintained in reverse order.
         *   directio_wait_one() depends on this fact when
         *   adjusting the ``bytes_io'' param. bytes_io
         *   is used to compute a residual in the case of error.
         */
        dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
        dbp->next = *tailp;
        *tailp = dbp;

        /*
         * Initialize buf header
         */
        dbp->addr = addr;
        dbp->nbytes = nbytes;
        bp = &dbp->buf;
        bp->b_edev = ip->i_dev;
        bp->b_lblkno = btodt(offset);
        bp->b_bcount = nbytes;
        bp->b_un.b_addr = addr;
        bp->b_proc = procp;
        bp->b_file = ip->i_vnode;

        /*
         * Note that S_WRITE implies B_READ and vice versa: a read(2)
         * will B_READ data from the filesystem and S_WRITE it into
         * the user's buffer; a write(2) will S_READ data from the
         * user's buffer and B_WRITE it to the filesystem.
         */
        if (rw == S_WRITE) {
                bp->b_flags = B_BUSY | B_PHYS | B_READ;
                ufs_directio_kstats.phys_reads.value.ui64++;
                ufs_directio_kstats.nread.value.ui64 += nbytes;
        } else {
                bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
                ufs_directio_kstats.phys_writes.value.ui64++;
                ufs_directio_kstats.nwritten.value.ui64 += nbytes;
        }
        bp->b_shadow = pplist;
        if (pplist != NULL)
                bp->b_flags |= B_SHADOW;

        /*
         * Issue I/O request.
         */
        ufsvfsp->vfs_iotstamp = ddi_get_lbolt();
        if (ufsvfsp->vfs_snapshot)
                fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
        else
                (void) bdev_strategy(bp);

        if (rw == S_WRITE)
                lwp_stat_update(LWP_STAT_OUBLK, 1);
        else
                lwp_stat_update(LWP_STAT_INBLK, 1);

}

uint32_t        ufs_shared_writes;      /* writes done w/ lock shared */
uint32_t        ufs_cur_writes;         /* # concurrent writes */
uint32_t        ufs_maxcur_writes;      /* high water concurrent writes */
uint32_t        ufs_posix_hits;         /* writes done /w lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t        ufs_force_posix_sdi = 0;

/*
 * Direct Write
 *
 * Attempt to transfer the data described by `arg_uio' directly between
 * the user's buffer and the disk, bypassing the page cache.  On return,
 * *statusp is DIRECTIO_SUCCESS if the direct path was taken (the return
 * value is then the I/O error, if any), or DIRECTIO_FAILURE if the
 * caller should fall back to the normal buffered write path (the return
 * value is then 0).
 *
 * Entered with ip->i_contents held; the lock may be temporarily dropped
 * and re-entered as writer below (see the rw_exit/rw_enter pairs).
 * `rewrite' is nonzero when overwriting already-allocated blocks, in
 * which case block allocation is skipped entirely.
 */

int
ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
    cred_t *cr, int *statusp)
{
        long            resid, bytes_written;
        u_offset_t      size, uoff;
        uio_t           *uio = arg_uio;
        rlim64_t        limit = uio->uio_llimit;
        int             on, n, error, newerror, len, has_holes;
        daddr_t         bn;
        size_t          nbytes;
        struct fs       *fs;
        vnode_t         *vp;
        iovec_t         *iov;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct proc     *procp;
        struct as       *as;
        struct directio_buf     *tail;
        int             exclusive, ncur, bmap_peek;
        uio_t           copy_uio;
        iovec_t         copy_iov;
        char            *copy_base;
        long            copy_resid;

        /*
         * assume that directio isn't possible (normal case)
         */
        *statusp = DIRECTIO_FAILURE;

        /*
         * Don't go direct
         */
        if (ufs_directio_enabled == 0)
                return (0);

        /*
         * mapped file; nevermind
         */
        if (ip->i_mapcnt)
                return (0);

        /*
         * CAN WE DO DIRECT IO?
         */
        uoff = uio->uio_loffset;
        resid = uio->uio_resid;

        /*
         * beyond limit
         */
        if (uoff + resid > limit)
                return (0);

        /*
         * must be sector aligned
         */
        if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
                return (0);

        /*
         * SHOULD WE DO DIRECT IO?
         */
        size = ip->i_size;
        has_holes = -1;         /* -1 == not yet checked via bmap_has_holes() */

        /*
         * only on regular files; no metadata
         */
        if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
                return (0);

        /*
         * Synchronous, allocating writes run very slow in Direct-Mode
         *      XXX - can be fixed with bmap_write changes for large writes!!!
         *      XXX - can be fixed for updates to "almost-full" files
         *      XXX - WARNING - system hangs if bmap_write() has to
         *                      allocate lots of pages since pageout
         *                      suspends on locked inode
         */
        if (!rewrite && (ip->i_flag & ISYNC)) {
                if ((uoff + resid) > size)
                        return (0);
                has_holes = bmap_has_holes(ip);
                if (has_holes)
                        return (0);
        }

        /*
         * Each iovec must be short aligned and sector aligned.  If
         * one is not, then kmem_alloc a new buffer and copy all of
         * the smaller buffers into the new buffer.  This new
         * buffer will be short aligned and sector aligned.
         */
        iov = uio->uio_iov;
        nbytes = uio->uio_iovcnt;
        while (nbytes--) {
                if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
                    (intptr_t)(iov->iov_base) & 1) {
                        copy_resid = uio->uio_resid;
                        copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
                        if (copy_base == NULL)
                                return (0);
                        copy_iov.iov_base = copy_base;
                        copy_iov.iov_len = copy_resid;
                        copy_uio.uio_iov = &copy_iov;
                        copy_uio.uio_iovcnt = 1;
                        copy_uio.uio_segflg = UIO_SYSSPACE;
                        copy_uio.uio_extflg = UIO_COPY_DEFAULT;
                        copy_uio.uio_loffset = uio->uio_loffset;
                        copy_uio.uio_resid = uio->uio_resid;
                        copy_uio.uio_llimit = uio->uio_llimit;
                        /* gather the user's data into the bounce buffer */
                        error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
                        if (error) {
                                kmem_free(copy_base, copy_resid);
                                return (0);
                        }
                        uio = &copy_uio;
                        break;
                }
                iov++;
        }

        /*
         * From here on down, all error exits must go to errout and
         * not simply return a 0, so that the bounce buffer (if any)
         * is freed.
         */

        /*
         * DIRECTIO
         */

        fs = ip->i_fs;

        /*
         * POSIX check. If attempting a concurrent re-write, make sure
         * that this will be a single request to the driver to meet
         * POSIX synchronous data integrity requirements.
         */
        bmap_peek = 0;
        if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
                int upgrade = 0;

                /* check easy conditions first */
                if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
                        upgrade = 1;
                } else {
                        /* now look for contiguous allocation */
                        len = (ssize_t)blkroundup(fs, resid);
                        error = bmap_read(ip, uoff, &bn, &len);
                        if (error || bn == UFS_HOLE || len == 0)
                                goto errout;
                        /* save a call to bmap_read later */
                        bmap_peek = 1;
                        if (len < resid)
                                upgrade = 1;
                }
                if (upgrade) {
                        /* serialize the write by taking i_contents as writer */
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                        ufs_posix_hits++;
                }
        }


        /*
         * allocate space
         */

        /*
         * If attempting a re-write, there is no allocation to do.
         * bmap_write would trip an ASSERT if i_contents is held shared.
         */
        if (rewrite)
                goto skip_alloc;

        /*
         * Allocate backing blocks one filesystem block at a time,
         * growing i_size as the file is extended.
         */
        do {
                on = (int)blkoff(fs, uoff);
                n = (int)MIN(fs->fs_bsize - on, resid);
                if ((uoff + n) > ip->i_size) {
                        error = bmap_write(ip, uoff, (int)(on + n),
                            (int)(uoff & (offset_t)MAXBOFFSET) == 0,
                            NULL, cr);
                        /* Caller is responsible for updating i_seq if needed */
                        if (error)
                                break;
                        ip->i_size = uoff + n;
                        ip->i_flag |= IATTCHG;
                } else if (n == MAXBSIZE) {
                        error = bmap_write(ip, uoff, (int)(on + n),
                            BI_ALLOC_ONLY, NULL, cr);
                        /* Caller is responsible for updating i_seq if needed */
                } else {
                        if (has_holes < 0)
                                has_holes = bmap_has_holes(ip);
                        if (has_holes) {
                                uint_t  blk_size;
                                u_offset_t offset;

                                offset = uoff & (offset_t)fs->fs_bmask;
                                blk_size = (int)blksize(fs, ip,
                                    (daddr_t)lblkno(fs, offset));
                                error = bmap_write(ip, uoff, blk_size,
                                    BI_NORMAL, NULL, cr);
                                /*
                                 * Caller is responsible for updating
                                 * i_seq if needed
                                 */
                        } else
                                error = 0;
                }
                if (error)
                        break;
                uoff += n;
                resid -= n;
                /*
                 * if file has grown larger than 2GB, set flag
                 * in superblock if not already set
                 */
                if ((ip->i_size > MAXOFF32_T) &&
                    !(fs->fs_flags & FSLARGEFILES)) {
                        ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
                        mutex_enter(&ufsvfsp->vfs_lock);
                        fs->fs_flags |= FSLARGEFILES;
                        ufs_sbwrite(ufsvfsp);
                        mutex_exit(&ufsvfsp->vfs_lock);
                }
        } while (resid);

        if (error) {
                /*
                 * restore original state by truncating back to the
                 * pre-write size if we managed to extend the file
                 */
                if (resid) {
                        if (size == ip->i_size)
                                goto errout;
                        (void) ufs_itrunc(ip, size, 0, cr);
                }
                /*
                 * try non-directio path
                 */
                goto errout;
        }
skip_alloc:

        /*
         * get rid of cached pages
         */
        vp = ITOV(ip);
        exclusive = rw_write_held(&ip->i_contents);
        if (vn_has_cached_data(vp)) {
                if (!exclusive) {
                        /*
                         * Still holding i_rwlock, so no allocations
                         * can happen after dropping contents.
                         */
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                }
                (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                    B_INVAL, cr, NULL);
                if (vn_has_cached_data(vp))
                        goto errout;
                if (!exclusive)
                        rw_downgrade(&ip->i_contents);
                ufs_directio_kstats.nflushes.value.ui64++;
        }

        /*
         * Direct Writes
         */

        if (!exclusive) {
                /* track how many direct writes run concurrently (shared) */
                ufs_shared_writes++;
                ncur = atomic_inc_32_nv(&ufs_cur_writes);
                if (ncur > ufs_maxcur_writes)
                        ufs_maxcur_writes = ncur;
        }

        /*
         * proc and as are for VM operations in directio_start()
         */
        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                as = procp->p_as;
        } else {
                procp = NULL;
                as = &kas;
        }
        *statusp = DIRECTIO_SUCCESS;
        error = 0;
        newerror = 0;
        resid = uio->uio_resid;
        bytes_written = 0;
        ufs_directio_kstats.logical_writes.value.ui64++;
        while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
                size_t pglck_len, pglck_size;
                caddr_t pglck_base;
                page_t **pplist, **spplist;

                tail = NULL;

                /*
                 * Adjust number of bytes
                 */
                iov = uio->uio_iov;
                pglck_len = (size_t)MIN(iov->iov_len, resid);
                pglck_base = iov->iov_base;
                if (pglck_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }

                /*
                 * Try to Lock down the largest chunk of pages possible.
                 */
                pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
                error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);

                if (error)
                        break;

                pglck_size = pglck_len;
                while (pglck_len) {

                        nbytes = pglck_len;
                        uoff = uio->uio_loffset;

                        if (!bmap_peek) {

                                /*
                                 * Re-adjust number of bytes to contiguous
                                 * range. May have already called bmap_read
                                 * in the case of a concurrent rewrite.
                                 */
                                len = (ssize_t)blkroundup(fs, nbytes);
                                error = bmap_read(ip, uoff, &bn, &len);
                                if (error)
                                        break;
                                if (bn == UFS_HOLE || len == 0)
                                        break;
                        }
                        nbytes = (size_t)MIN(nbytes, len);
                        bmap_peek = 0;

                        /*
                         * Get the pagelist pointer for this offset to be
                         * passed to directio_start.
                         */

                        if (pplist != NULL)
                                spplist = pplist +
                                    btop((uintptr_t)iov->iov_base -
                                    ((uintptr_t)pglck_base & PAGEMASK));
                        else
                                spplist = NULL;

                        /*
                         * Kick off the direct write requests
                         */
                        directio_start(ufsvfsp, ip, nbytes, ldbtob(bn),
                            iov->iov_base, S_READ, procp, &tail, spplist);

                        /*
                         * Adjust pointers and counters
                         */
                        iov->iov_len -= nbytes;
                        iov->iov_base += nbytes;
                        uio->uio_loffset += nbytes;
                        resid -= nbytes;
                        pglck_len -= nbytes;
                }

                /*
                 * Wait for outstanding requests
                 */
                newerror = directio_wait(tail, &bytes_written);

                /*
                 * Release VM resources
                 */
                as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);

        }

        if (!exclusive) {
                atomic_dec_32(&ufs_cur_writes);
                /*
                 * If this write was done shared, readers may
                 * have pulled in unmodified pages. Get rid of
                 * these potentially stale pages.
                 */
                if (vn_has_cached_data(vp)) {
                        rw_exit(&ip->i_contents);
                        rw_enter(&ip->i_contents, RW_WRITER);
                        (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                            B_INVAL, cr, NULL);
                        ufs_directio_kstats.nflushes.value.ui64++;
                        rw_downgrade(&ip->i_contents);
                }
        }

        /*
         * If error, adjust resid to begin at the first
         * un-writable byte.
         */
        if (error == 0)
                error = newerror;
        if (error)
                resid = uio->uio_resid - bytes_written;
        /* propagate the residual to the caller's uio (uio may be copy_uio) */
        arg_uio->uio_resid = resid;

        if (!rewrite) {
                ip->i_flag |= IUPD | ICHG;
                /* Caller will update i_seq */
                TRANS_INODE(ip->i_ufsvfs, ip);
        }
        /*
         * If there is a residual; adjust the EOF if necessary
         */
        if (resid) {
                if (size != ip->i_size) {
                        if (uio->uio_loffset > size)
                                size = uio->uio_loffset;
                        (void) ufs_itrunc(ip, size, 0, cr);
                }
        }

        if (uio == &copy_uio)
                kmem_free(copy_base, copy_resid);

        return (error);

errout:
        /* fall back to the buffered write path; free the bounce buffer */
        if (uio == &copy_uio)
                kmem_free(copy_base, copy_resid);

        return (0);
}
/*
 * Direct read of a hole: satisfy nbytes of the read by copying from
 * the preallocated zero buffer into the destination described by uio.
 * The caller's uio is not modified; a private uio/iovec pair is built
 * for the transfer.
 */
static int
directio_hole(struct uio *uio, size_t nbytes)
{
        uio_t           zero_uio;
        iovec_t         zero_iov;
        int             error;

        ufs_directio_kstats.hole_reads.value.ui64++;
        ufs_directio_kstats.nread.value.ui64 += nbytes;

        /* Describe just this hole's portion of the destination. */
        zero_iov.iov_base = uio->uio_iov->iov_base;
        zero_iov.iov_len = nbytes;

        zero_uio.uio_iov = &zero_iov;
        zero_uio.uio_iovcnt = 1;
        zero_uio.uio_resid = zero_iov.iov_len;
        zero_uio.uio_segflg = uio->uio_segflg;
        zero_uio.uio_extflg = uio->uio_extflg;

        /* Copy out zeroes in chunks no larger than the zero buffer. */
        error = 0;
        while (error == 0 && zero_uio.uio_resid) {
                int nzero;

                nzero = (int)MIN(zero_iov.iov_len, ufs_directio_zero_len);
                error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
                    &zero_uio);
        }
        return (error);
}

/*
 * Direct Read
 *
 * Attempt to transfer the data described by `uio' directly from the
 * disk into the user's buffer, bypassing the page cache.  On return,
 * *statusp is DIRECTIO_SUCCESS if the direct path was taken (the
 * return value is then the I/O error, if any), or DIRECTIO_FAILURE if
 * the caller should fall back to the normal buffered read path (the
 * return value is then 0).  Holes are satisfied by directio_hole().
 *
 * Entered with ip->i_contents held; the lock may be temporarily
 * re-entered as writer to flush cached pages (see below).
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
        ssize_t         resid, bytes_read;
        u_offset_t      size, uoff;
        int             error, newerror, len;
        size_t          nbytes;
        struct fs       *fs;
        vnode_t         *vp;
        daddr_t         bn;
        iovec_t         *iov;
        struct ufsvfs   *ufsvfsp = ip->i_ufsvfs;
        struct proc     *procp;
        struct as       *as;
        struct directio_buf     *tail;

        /*
         * assume that directio isn't possible (normal case)
         */
        *statusp = DIRECTIO_FAILURE;

        /*
         * Don't go direct
         */
        if (ufs_directio_enabled == 0)
                return (0);

        /*
         * mapped file; nevermind
         */
        if (ip->i_mapcnt)
                return (0);

        /*
         * CAN WE DO DIRECT IO?
         */
        /*
         * must be sector aligned
         */
        uoff = uio->uio_loffset;
        resid = uio->uio_resid;
        if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
                return (0);
        /*
         * must be short aligned and sector aligned
         */
        iov = uio->uio_iov;
        nbytes = uio->uio_iovcnt;
        while (nbytes--) {
                if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
                        return (0);
                if ((intptr_t)(iov++->iov_base) & 1)
                        return (0);
        }

        /*
         * DIRECTIO
         */
        fs = ip->i_fs;

        /*
         * don't read past EOF
         */
        size = ip->i_size;

        /*
         * The file offset is past EOF so bail out here; we don't want
         * to update uio_resid and make it look like we read something.
         * We say that direct I/O was a success to avoid having rdip()
         * go through the same "read past EOF logic".
         */
        if (uoff >= size) {
                *statusp = DIRECTIO_SUCCESS;
                return (0);
        }

        /*
         * The read would extend past EOF so make it smaller.
         */
        if ((uoff + resid) > size) {
                resid = size - uoff;
                /*
                 * recheck sector alignment
                 */
                if (resid & (DEV_BSIZE - 1))
                        return (0);
        }

        /*
         * At this point, we know there is some real work to do.
         */
        ASSERT(resid);

        /*
         * get rid of cached pages
         */
        vp = ITOV(ip);
        if (vn_has_cached_data(vp)) {
                rw_exit(&ip->i_contents);
                rw_enter(&ip->i_contents, RW_WRITER);
                (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
                    B_INVAL, cr, NULL);
                if (vn_has_cached_data(vp))
                        return (0);
                rw_downgrade(&ip->i_contents);
                ufs_directio_kstats.nflushes.value.ui64++;
        }
        /*
         * Direct Reads
         */

        /*
         * proc and as are for VM operations in directio_start()
         */
        if (uio->uio_segflg == UIO_USERSPACE) {
                procp = ttoproc(curthread);
                as = procp->p_as;
        } else {
                procp = NULL;
                as = &kas;
        }

        *statusp = DIRECTIO_SUCCESS;
        error = 0;
        newerror = 0;
        bytes_read = 0;
        ufs_directio_kstats.logical_reads.value.ui64++;
        while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
                size_t pglck_len, pglck_size;
                caddr_t pglck_base;
                page_t **pplist, **spplist;

                tail = NULL;

                /*
                 * Adjust number of bytes
                 */
                iov = uio->uio_iov;
                pglck_len = (size_t)MIN(iov->iov_len, resid);
                pglck_base = iov->iov_base;
                if (pglck_len == 0) {
                        uio->uio_iov++;
                        uio->uio_iovcnt--;
                        continue;
                }

                /*
                 * Try to Lock down the largest chunk of pages possible.
                 */
                pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
                error = as_pagelock(as, &pplist, pglck_base,
                    pglck_len, S_WRITE);

                if (error)
                        break;

                pglck_size = pglck_len;
                while (pglck_len) {

                        nbytes = pglck_len;
                        uoff = uio->uio_loffset;

                        /*
                         * Re-adjust number of bytes to contiguous range
                         */
                        len = (ssize_t)blkroundup(fs, nbytes);
                        error = bmap_read(ip, uoff, &bn, &len);
                        if (error)
                                break;

                        if (bn == UFS_HOLE) {
                                /*
                                 * Zero-fill the hole, at most up to the
                                 * end of this filesystem block.
                                 */
                                nbytes = (size_t)MIN(fs->fs_bsize -
                                    (long)blkoff(fs, uoff), nbytes);
                                error = directio_hole(uio, nbytes);
                                /*
                                 * Hole reads are not added to the list
                                 * processed by directio_wait() below so
                                 * account for bytes read here.
                                 */
                                if (!error)
                                        bytes_read += nbytes;
                        } else {
                                nbytes = (size_t)MIN(nbytes, len);

                                /*
                                 * Get the pagelist pointer for this offset
                                 * to be passed to directio_start.
                                 */
                                if (pplist != NULL)
                                        spplist = pplist +
                                            btop((uintptr_t)iov->iov_base -
                                            ((uintptr_t)pglck_base & PAGEMASK));
                                else
                                        spplist = NULL;

                                /*
                                 * Kick off the direct read requests
                                 */
                                directio_start(ufsvfsp, ip, nbytes,
                                    ldbtob(bn), iov->iov_base,
                                    S_WRITE, procp, &tail, spplist);
                        }

                        if (error)
                                break;

                        /*
                         * Adjust pointers and counters
                         */
                        iov->iov_len -= nbytes;
                        iov->iov_base += nbytes;
                        uio->uio_loffset += nbytes;
                        resid -= nbytes;
                        pglck_len -= nbytes;
                }

                /*
                 * Wait for outstanding requests
                 */
                newerror = directio_wait(tail, &bytes_read);
                /*
                 * Release VM resources
                 */
                as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

        }

        /*
         * If error, adjust resid to begin at the first
         * un-read byte.
         */
        if (error == 0)
                error = newerror;
        uio->uio_resid -= bytes_read;
        return (error);
}