root/usr/src/uts/common/fs/ufs/ufs_alloc.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T     */
/*        All Rights Reserved   */

/*
 * University Copyright- Copyright (c) 1982, 1986, 1988
 * The Regents of the University of California
 * All Rights Reserved
 *
 * University Acknowledgment- Portions of this document are derived from
 * software developed by the University of California, Berkeley, and its
 * contributors.
 */

#include <sys/condvar_impl.h>
#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/debug.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vnode.h>
#include <sys/acl.h>
#include <sys/fs/ufs_fs.h>
#include <sys/fs/ufs_inode.h>
#include <sys/fs/ufs_acl.h>
#include <sys/fs/ufs_bio.h>
#include <sys/fs/ufs_quota.h>
#include <sys/kmem.h>
#include <sys/fs/ufs_trans.h>
#include <sys/fs/ufs_panic.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/sysmacros.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <fs/fs_subr.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>
#include <sys/fs/ufs_log.h>

static ino_t    hashalloc();
static daddr_t  fragextend();
static daddr_t  alloccg();
static daddr_t  alloccgblk();
static ino_t    ialloccg();
static daddr_t  mapsearch();
static int      findlogstartcg();

extern int      inside[], around[];
extern uchar_t  *fragtbl[];
void delay();

/*
 * Allocate a block in the file system.
 *
 * The size of the requested block is given, which must be some
 * multiple of fs_fsize and <= fs_bsize.
 * A preference may be optionally specified. If a preference is given
 * the following hierarchy is used to allocate a block:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate a block in the same cylinder group.
 *   4) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 * If no block preference is given the following hierarchy is used
 * to allocate a block:
 *   1) allocate a block in the cylinder group that contains the
 *      inode for the file.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available block is located.
 */
int
alloc(struct inode *ip, daddr_t bpref, int size, daddr_t *bnp, cred_t *cr)
{
        struct fs *fs;
        struct ufsvfs *ufsvfsp;
        daddr_t bno;
        int cg;
        int err;
        char *errmsg = NULL;
        size_t len;
        clock_t now;

        ufsvfsp = ip->i_ufsvfs;
        fs = ufsvfsp->vfs_fs;
        if ((unsigned)size > fs->fs_bsize || fragoff(fs, size) != 0) {
                err = ufs_fault(ITOV(ip), "alloc: bad size, dev = 0x%lx,"
                    " bsize = %d, size = %d, fs = %s\n",
                    ip->i_dev, fs->fs_bsize, size, fs->fs_fsmnt);
                return (err);
        }
        if (size == fs->fs_bsize && fs->fs_cstotal.cs_nbfree == 0)
                goto nospace;
        if (freespace(fs, ufsvfsp) <= 0 &&
            secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
                goto nospace;
        err = chkdq(ip, (long)btodb(size), 0, cr, &errmsg, &len);
        /* Note that may not have err, but may have errmsg */
        if (errmsg != NULL) {
                uprintf(errmsg);
                kmem_free(errmsg, len);
                errmsg = NULL;
        }
        if (err)
                return (err);
        if (bpref >= fs->fs_size)
                bpref = 0;
        if (bpref == 0)
                cg = (int)itog(fs, ip->i_number);
        else
                cg = dtog(fs, bpref);

        bno = (daddr_t)hashalloc(ip, cg, (long)bpref, size,
            (ulong_t (*)())alloccg);
        if (bno > 0) {
                *bnp = bno;
                return (0);
        }

        /*
         * hashalloc() failed because some other thread grabbed
         * the last block so unwind the quota operation.  We can
         * ignore the return because subtractions don't fail and
         * size is guaranteed to be >= zero by our caller.
         */
        (void) chkdq(ip, -(long)btodb(size), 0, cr, (char **)NULL,
            (size_t *)NULL);

nospace:
        now = ddi_get_lbolt();
        mutex_enter(&ufsvfsp->vfs_lock);
        if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
            (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
                ufsvfsp->vfs_lastwhinetime = now;
                cmn_err(CE_NOTE, "alloc: %s: file system full", fs->fs_fsmnt);
        }
        mutex_exit(&ufsvfsp->vfs_lock);
        return (ENOSPC);
}

/*
 * Reallocate a fragment to a bigger size
 *
 * The number and size of the old block is given, and a preference
 * and new size is also specified.  The allocator attempts to extend
 * the original block.  Failing that, the regular block allocator is
 * invoked to get an appropriate block.
 */
int
realloccg(struct inode *ip, daddr_t bprev, daddr_t bpref, int osize,
    int nsize, daddr_t *bnp, cred_t *cr)
{
        daddr_t bno;
        struct fs *fs;
        struct ufsvfs *ufsvfsp;
        int cg, request;
        int err;
        char *errmsg = NULL;
        size_t len;
        clock_t now;

        ufsvfsp = ip->i_ufsvfs;
        fs = ufsvfsp->vfs_fs;
        if ((unsigned)osize > fs->fs_bsize || fragoff(fs, osize) != 0 ||
            (unsigned)nsize > fs->fs_bsize || fragoff(fs, nsize) != 0) {
                err = ufs_fault(ITOV(ip),
                    "realloccg: bad size, dev=0x%lx, bsize=%d, "
                    "osize=%d, nsize=%d, fs=%s\n",
                    ip->i_dev, fs->fs_bsize, osize, nsize, fs->fs_fsmnt);
                return (err);
        }
        if (freespace(fs, ufsvfsp) <= 0 &&
            secpolicy_fs_minfree(cr, ufsvfsp->vfs_vfs) != 0)
                goto nospace;
        if (bprev == 0) {
                err = ufs_fault(ITOV(ip),
                    "realloccg: bad bprev, dev = 0x%lx, bsize = %d,"
                    " bprev = %ld, fs = %s\n", ip->i_dev, fs->fs_bsize, bprev,
                    fs->fs_fsmnt);
                return (err);
        }
        err = chkdq(ip, (long)btodb(nsize - osize), 0, cr, &errmsg, &len);
        /* Note that may not have err, but may have errmsg */
        if (errmsg != NULL) {
                uprintf(errmsg);
                kmem_free(errmsg, len);
                errmsg = NULL;
        }
        if (err)
                return (err);
        cg = dtog(fs, bprev);
        bno = fragextend(ip, cg, (long)bprev, osize, nsize);
        if (bno != 0) {
                *bnp = bno;
                return (0);
        }
        if (bpref >= fs->fs_size)
                bpref = 0;

        /*
         * When optimizing for time we allocate a full block and
         * then only use the upper portion for this request. When
         * this file grows again it will grow into the unused portion
         * of the block (See fragextend() above).  This saves time
         * because an extra disk write would be needed if the frags
         * following the current allocation were not free. The extra
         * disk write is needed to move the data from its current
         * location into the newly allocated position.
         *
         * When optimizing for space we allocate a run of frags
         * that is just the right size for this request.
         */
        request = (fs->fs_optim == FS_OPTTIME) ? fs->fs_bsize : nsize;
        bno = (daddr_t)hashalloc(ip, cg, (long)bpref, request,
            (ulong_t (*)())alloccg);
        if (bno > 0) {
                *bnp = bno;
                if (nsize < request)
                        (void) free(ip, bno + numfrags(fs, nsize),
                            (off_t)(request - nsize), I_NOCANCEL);
                return (0);
        }

        /*
         * hashalloc() failed because some other thread grabbed
         * the last block so unwind the quota operation.  We can
         * ignore the return because subtractions don't fail, and
         * our caller guarantees nsize >= osize.
         */
        (void) chkdq(ip, -(long)btodb(nsize - osize), 0, cr, (char **)NULL,
            (size_t *)NULL);

nospace:
        now = ddi_get_lbolt();
        mutex_enter(&ufsvfsp->vfs_lock);
        if ((now - ufsvfsp->vfs_lastwhinetime) > (hz << 2) &&
            (!(TRANS_ISTRANS(ufsvfsp)) || !(ip->i_flag & IQUIET))) {
                ufsvfsp->vfs_lastwhinetime = now;
                cmn_err(CE_NOTE,
                    "realloccg %s: file system full", fs->fs_fsmnt);
        }
        mutex_exit(&ufsvfsp->vfs_lock);
        return (ENOSPC);
}

/*
 * Allocate an inode in the file system.
 *
 * A preference may be optionally specified. If a preference is given
 * the following hierarchy is used to allocate an inode:
 *   1) allocate the requested inode.
 *   2) allocate an inode in the same cylinder group.
 *   3) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 * If no inode preference is given the following hierarchy is used
 * to allocate an inode:
 *   1) allocate an inode in cylinder group 0.
 *   2) quadratically rehash into other cylinder groups, until an
 *      available inode is located.
 */
int
ufs_ialloc(struct inode *pip,
    ino_t ipref, mode_t mode, struct inode **ipp, cred_t *cr)
{
        struct inode *ip;
        struct fs *fs;
        int cg;
        ino_t ino;
        int err;
        int nifree;
        struct ufsvfs *ufsvfsp = pip->i_ufsvfs;
        char *errmsg = NULL;
        size_t len;

        ASSERT(RW_WRITE_HELD(&pip->i_rwlock));
        fs = pip->i_fs;
loop:
        nifree = fs->fs_cstotal.cs_nifree;

        if (nifree == 0)
                goto noinodes;
        /*
         * Shadow inodes don't count against a user's inode allocation.
         * They are an implementation method and not a resource.
         */
        if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
                err = chkiq((struct ufsvfs *)ITOV(pip)->v_vfsp->vfs_data,
                    /* change */ 1, (struct inode *)NULL, crgetuid(cr), 0,
                    cr, &errmsg, &len);
                /*
                 * As we haven't acquired any locks yet, dump the message
                 * now.
                 */
                if (errmsg != NULL) {
                        uprintf(errmsg);
                        kmem_free(errmsg, len);
                        errmsg = NULL;
                }
                if (err)
                        return (err);
        }

        if (ipref >= (ulong_t)(fs->fs_ncg * fs->fs_ipg))
                ipref = 0;
        cg = (int)itog(fs, ipref);
        ino = (ino_t)hashalloc(pip, cg, (long)ipref, (int)mode,
            (ulong_t (*)())ialloccg);
        if (ino == 0) {
                if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
                        /*
                         * We can safely ignore the return from chkiq()
                         * because deallocations can only fail if we
                         * can't get the user's quota info record off
                         * the disk due to an I/O error.  In that case,
                         * the quota subsystem is already messed up.
                         */
                        (void) chkiq(ufsvfsp, /* change */ -1,
                            (struct inode *)NULL, crgetuid(cr), 0, cr,
                            (char **)NULL, (size_t *)NULL);
                }
                goto noinodes;
        }
        err = ufs_iget(pip->i_vfs, ino, ipp, cr);
        if (err) {
                if ((mode != IFSHAD) && (mode != IFATTRDIR)) {
                        /*
                         * See above comment about why it is safe to ignore an
                         * error return here.
                         */
                        (void) chkiq(ufsvfsp, /* change */ -1,
                            (struct inode *)NULL, crgetuid(cr), 0, cr,
                            (char **)NULL, (size_t *)NULL);
                }
                ufs_ifree(pip, ino, 0);
                return (err);
        }
        ip = *ipp;
        ASSERT(!ip->i_ufs_acl);
        ASSERT(!ip->i_dquot);
        rw_enter(&ip->i_contents, RW_WRITER);

        /*
         * Check if we really got a free inode, if not then complain
         * and mark the inode ISTALE so that it will be freed by the
         * ufs idle thread eventually and will not be sent to ufs_delete().
         */
        if (ip->i_mode || (ip->i_nlink > 0)) {
                ip->i_flag |= ISTALE;
                rw_exit(&ip->i_contents);
                VN_RELE(ITOV(ip));
                cmn_err(CE_WARN,
                    "%s: unexpected allocated inode %d, run fsck(8)%s",
                    fs->fs_fsmnt, (int)ino,
                    (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
                goto loop;
        }

        /*
         * Check the inode has no size or data blocks.
         * This could have happened if the truncation failed when
         * deleting the inode. It used to be possible for this to occur
         * if a block allocation failed when iteratively truncating a
         * large file using logging and with a full file system.
         * This was fixed with bug fix 4348738. However, truncation may
         * still fail on an IO error. So in all cases for safety and
         * security we clear out the size; the blocks allocated; and
         * pointers to the blocks. This will ultimately cause a fsck
         * error of un-accounted for blocks, but its a fairly benign error,
         * and possibly the correct thing to do anyway as accesssing those
         * blocks agains may lead to more IO errors.
         */
        if (ip->i_size || ip->i_blocks) {
                int i;

                if (ip->i_size) {
                        cmn_err(CE_WARN,
                            "%s: free inode %d had size 0x%llx, run fsck(8)%s",
                            fs->fs_fsmnt, (int)ino, ip->i_size,
                            (TRANS_ISTRANS(ufsvfsp) ? " -o f" : ""));
                }
                /*
                 * Clear any garbage left behind.
                 */
                ip->i_size = (u_offset_t)0;
                ip->i_blocks = 0;
                for (i = 0; i < NDADDR; i++)
                        ip->i_db[i] = 0;
                for (i = 0; i < NIADDR; i++)
                        ip->i_ib[i] = 0;
        }

        /*
         * Initialize the link count
         */
        ip->i_nlink = 0;

        /*
         * Clear the old flags
         */
        ip->i_flag &= IREF;

        /*
         * Access times are not really defined if the fs is mounted
         * with 'noatime'. But it can cause nfs clients to fail
         * open() if the atime is not a legal value. Set a legal value
         * here when the inode is allocated.
         */
        if (ufsvfsp->vfs_noatime) {
                mutex_enter(&ufs_iuniqtime_lock);
                ip->i_atime = iuniqtime;
                mutex_exit(&ufs_iuniqtime_lock);
        }
        rw_exit(&ip->i_contents);
        return (0);
noinodes:
        if (!(TRANS_ISTRANS(ufsvfsp)) || !(pip->i_flag & IQUIET))
                cmn_err(CE_NOTE, "%s: out of inodes\n", fs->fs_fsmnt);
        return (ENOSPC);
}

/*
 * Find a cylinder group to place a directory.
 * Returns an inumber within the selected cylinder group.
 * Note, the vfs_lock is not needed as we don't require exact cg summary info.
 *
 * If the switch ufs_close_dirs is set, then the policy is to use
 * the current cg if it has more than 25% free inodes and more
 * than 25% free blocks. Otherwise the cgs are searched from
 * the beginning and the first cg with the same criteria is
 * used. If that is also null then we revert to the old algorithm.
 * This tends to cluster files at the beginning of the disk
 * until the disk gets full.
 *
 * Otherwise if ufs_close_dirs is not set then the original policy is
 * used which is to select from among those cylinder groups with
 * above the average number of free inodes, the one with the smallest
 * number of directories.
 */

int ufs_close_dirs = 1; /* allocate directories close as possible */

ino_t
dirpref(inode_t *dp)
{
        int cg, minndir, mincg, avgifree, mininode, minbpg, ifree;
        struct fs *fs = dp->i_fs;

        cg = itog(fs, dp->i_number);
        mininode = fs->fs_ipg >> 2;
        minbpg = fs->fs_maxbpg >> 2;
        if (ufs_close_dirs &&
            (fs->fs_cs(fs, cg).cs_nifree > mininode) &&
            (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
                return (dp->i_number);
        }

        avgifree = fs->fs_cstotal.cs_nifree / fs->fs_ncg;
        minndir = fs->fs_ipg;
        mincg = 0;
        for (cg = 0; cg < fs->fs_ncg; cg++) {
                ifree = fs->fs_cs(fs, cg).cs_nifree;
                if (ufs_close_dirs &&
                    (ifree > mininode) &&
                    (fs->fs_cs(fs, cg).cs_nbfree > minbpg)) {
                        return ((ino_t)(fs->fs_ipg * cg));
                }
                if ((fs->fs_cs(fs, cg).cs_ndir < minndir) &&
                    (ifree >= avgifree)) {
                        mincg = cg;
                        minndir = fs->fs_cs(fs, cg).cs_ndir;
                }
        }
        return ((ino_t)(fs->fs_ipg * mincg));
}

/*
 * Select the desired position for the next block in a file.  The file is
 * logically divided into sections. The first section is composed of the
 * direct blocks. Each additional section contains fs_maxbpg blocks.
 *
 * If no blocks have been allocated in the first section, the policy is to
 * request a block in the same cylinder group as the inode that describes
 * the file. If no blocks have been allocated in any other section, the
 * policy is to place the section in a cylinder group with a greater than
 * average number of free blocks.  An appropriate cylinder group is found
 * by using a rotor that sweeps the cylinder groups. When a new group of
 * blocks is needed, the sweep begins in the cylinder group following the
 * cylinder group from which the previous allocation was made. The sweep
 * continues until a cylinder group with greater than the average number
 * of free blocks is found. If the allocation is for the first block in an
 * indirect block, the information on the previous allocation is unavailable;
 * here a best guess is made based upon the logical block number being
 * allocated.
 *
 * If a section is already partially allocated, the policy is to
 * contiguously allocate fs_maxcontig blocks.  The end of one of these
 * contiguous blocks and the beginning of the next is physically separated
 * so that the disk head will be in transit between them for at least
 * fs_rotdelay milliseconds.  This is to allow time for the processor to
 * schedule another I/O transfer.
 */
daddr_t
blkpref(struct inode *ip, daddr_t lbn, int indx, daddr32_t *bap)
{
        struct fs *fs;
        struct ufsvfs *ufsvfsp;
        int cg;
        int avgbfree, startcg;
        daddr_t nextblk;

        ufsvfsp = ip->i_ufsvfs;
        fs = ip->i_fs;
        if (indx % fs->fs_maxbpg == 0 || bap[indx - 1] == 0) {
                if (lbn < NDADDR) {
                        cg = itog(fs, ip->i_number);
                        return (fs->fs_fpg * cg + fs->fs_frag);
                }
                /*
                 * Find a cylinder with greater than average
                 * number of unused data blocks.
                 */
                if (indx == 0 || bap[indx - 1] == 0)
                        startcg = itog(fs, ip->i_number) + lbn / fs->fs_maxbpg;
                else
                        startcg = dtog(fs, bap[indx - 1]) + 1;
                startcg %= fs->fs_ncg;

                mutex_enter(&ufsvfsp->vfs_lock);
                avgbfree = fs->fs_cstotal.cs_nbfree / fs->fs_ncg;
                /*
                 * used for computing log space for writes/truncs
                 */
                ufsvfsp->vfs_avgbfree = avgbfree;
                for (cg = startcg; cg < fs->fs_ncg; cg++)
                        if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
                                fs->fs_cgrotor = cg;
                                mutex_exit(&ufsvfsp->vfs_lock);
                                return (fs->fs_fpg * cg + fs->fs_frag);
                        }
                for (cg = 0; cg <= startcg; cg++)
                        if (fs->fs_cs(fs, cg).cs_nbfree >= avgbfree) {
                                fs->fs_cgrotor = cg;
                                mutex_exit(&ufsvfsp->vfs_lock);
                                return (fs->fs_fpg * cg + fs->fs_frag);
                        }
                mutex_exit(&ufsvfsp->vfs_lock);
                return (0);
        }
        /*
         * One or more previous blocks have been laid out. If less
         * than fs_maxcontig previous blocks are contiguous, the
         * next block is requested contiguously, otherwise it is
         * requested rotationally delayed by fs_rotdelay milliseconds.
         */

        nextblk = bap[indx - 1];
        /*
         * Provision for fallocate to return positive
         * blk preference based on last allocation
         */
        if (nextblk < 0 && nextblk != UFS_HOLE) {
                nextblk = (-bap[indx - 1]) + fs->fs_frag;
        } else {
                nextblk = bap[indx - 1] + fs->fs_frag;
        }

        if (indx > fs->fs_maxcontig && bap[indx - fs->fs_maxcontig] +
            blkstofrags(fs, fs->fs_maxcontig) != nextblk) {
                return (nextblk);
        }
        if (fs->fs_rotdelay != 0)
                /*
                 * Here we convert ms of delay to frags as:
                 * (frags) = (ms) * (rev/sec) * (sect/rev) /
                 *      ((sect/frag) * (ms/sec))
                 * then round up to the next block.
                 */
                nextblk += roundup(fs->fs_rotdelay * fs->fs_rps * fs->fs_nsect /
                    (NSPF(fs) * 1000), fs->fs_frag);
        return (nextblk);
}

/*
 * Free a block or fragment.
 *
 * The specified block or fragment is placed back in the
 * free map. If a fragment is deallocated, a possible
 * block reassembly is checked.
 */
void
free(struct inode *ip, daddr_t bno, off_t size, int flags)
{
        struct fs *fs = ip->i_fs;
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        struct ufs_q *delq = &ufsvfsp->vfs_delete;
        struct ufs_delq_info *delq_info = &ufsvfsp->vfs_delete_info;
        struct cg *cgp;
        struct buf *bp;
        int cg, bmap, bbase;
        int i;
        uchar_t *blksfree;
        int *blktot;
        short *blks;
        daddr_t blkno, cylno, rpos;

        /*
         * fallocate'd files will have negative block address.
         * So negate it again to get original block address.
         */
        if (bno < 0 && (bno % fs->fs_frag == 0) && bno != UFS_HOLE) {
                bno = -bno;
        }

        if ((unsigned long)size > fs->fs_bsize || fragoff(fs, size) != 0) {
                (void) ufs_fault(ITOV(ip),
                    "free: bad size, dev = 0x%lx, bsize = %d, size = %d, "
                    "fs = %s\n", ip->i_dev, fs->fs_bsize,
                    (int)size, fs->fs_fsmnt);
                return;
        }
        cg = dtog(fs, bno);
        ASSERT(!ufs_badblock(ip, bno));
        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
            (int)fs->fs_cgsize);

        cgp = bp->b_un.b_cg;
        if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
                brelse(bp);
                return;
        }

        if (!(flags & I_NOCANCEL))
                TRANS_CANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size, flags);
        if (flags & (I_DIR|I_IBLK|I_SHAD|I_QUOTA)) {
                TRANS_MATA_FREE(ufsvfsp, ldbtob(fsbtodb(fs, bno)), size);
        }
        blksfree = cg_blksfree(cgp);
        blktot = cg_blktot(cgp);
        mutex_enter(&ufsvfsp->vfs_lock);
        cgp->cg_time = gethrestime_sec();
        bno = dtogd(fs, bno);
        if (size == fs->fs_bsize) {
                blkno = fragstoblks(fs, bno);
                cylno = cbtocylno(fs, bno);
                rpos = cbtorpos(ufsvfsp, bno);
                blks = cg_blks(ufsvfsp, cgp, cylno);
                if (!isclrblock(fs, blksfree, blkno)) {
                        mutex_exit(&ufsvfsp->vfs_lock);
                        brelse(bp);
                        (void) ufs_fault(ITOV(ip), "free: freeing free block, "
                            "dev:0x%lx, block:%ld, ino:%lu, fs:%s",
                            ip->i_dev, bno, ip->i_number, fs->fs_fsmnt);
                        return;
                }
                setblock(fs, blksfree, blkno);
                blks[rpos]++;
                blktot[cylno]++;
                cgp->cg_cs.cs_nbfree++;         /* Log below */
                fs->fs_cstotal.cs_nbfree++;
                fs->fs_cs(fs, cg).cs_nbfree++;
                if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
                        mutex_enter(&delq->uq_mutex);
                        delq_info->delq_unreclaimed_blocks -=
                            btodb(fs->fs_bsize);
                        mutex_exit(&delq->uq_mutex);
                }
        } else {
                bbase = bno - fragnum(fs, bno);
                /*
                 * Decrement the counts associated with the old frags
                 */
                bmap = blkmap(fs, blksfree, bbase);
                fragacct(fs, bmap, cgp->cg_frsum, -1);
                /*
                 * Deallocate the fragment
                 */
                for (i = 0; i < numfrags(fs, size); i++) {
                        if (isset(blksfree, bno + i)) {
                                brelse(bp);
                                mutex_exit(&ufsvfsp->vfs_lock);
                                (void) ufs_fault(ITOV(ip),
                                    "free: freeing free frag, "
                                    "dev:0x%lx, blk:%ld, cg:%d, "
                                    "ino:%lu, fs:%s",
                                    ip->i_dev,
                                    bno + i,
                                    cgp->cg_cgx,
                                    ip->i_number,
                                    fs->fs_fsmnt);
                                return;
                        }
                        setbit(blksfree, bno + i);
                }
                cgp->cg_cs.cs_nffree += i;
                fs->fs_cstotal.cs_nffree += i;
                fs->fs_cs(fs, cg).cs_nffree += i;
                if (TRANS_ISTRANS(ufsvfsp) && (flags & I_ACCT)) {
                        mutex_enter(&delq->uq_mutex);
                        delq_info->delq_unreclaimed_blocks -=
                            btodb(i * fs->fs_fsize);
                        mutex_exit(&delq->uq_mutex);
                }
                /*
                 * Add back in counts associated with the new frags
                 */
                bmap = blkmap(fs, blksfree, bbase);
                fragacct(fs, bmap, cgp->cg_frsum, 1);
                /*
                 * If a complete block has been reassembled, account for it
                 */
                blkno = fragstoblks(fs, bbase);
                if (isblock(fs, blksfree, blkno)) {
                        cylno = cbtocylno(fs, bbase);
                        rpos = cbtorpos(ufsvfsp, bbase);
                        blks = cg_blks(ufsvfsp, cgp, cylno);
                        blks[rpos]++;
                        blktot[cylno]++;
                        cgp->cg_cs.cs_nffree -= fs->fs_frag;
                        fs->fs_cstotal.cs_nffree -= fs->fs_frag;
                        fs->fs_cs(fs, cg).cs_nffree -= fs->fs_frag;
                        cgp->cg_cs.cs_nbfree++;
                        fs->fs_cstotal.cs_nbfree++;
                        fs->fs_cs(fs, cg).cs_nbfree++;
                }
        }
        fs->fs_fmod = 1;
        ufs_notclean(ufsvfsp);
        TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
        TRANS_SI(ufsvfsp, fs, cg);
        bdrwrite(bp);
}

/*
 * Free an inode.
 *
 * The specified inode is placed back in the free map.
 */
void
ufs_ifree(struct inode *ip, ino_t ino, mode_t mode)
{
        struct fs *fs = ip->i_fs;
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        struct cg *cgp;
        struct buf *bp;
        unsigned int inot;
        int cg;
        char *iused;

        if (ip->i_number == ino && ip->i_mode != 0) {
                (void) ufs_fault(ITOV(ip),
                    "ufs_ifree: illegal mode: (imode) %o, (omode) %o, ino %d, "
                    "fs = %s\n",
                    ip->i_mode, mode, (int)ip->i_number, fs->fs_fsmnt);
                return;
        }
        if (ino >= fs->fs_ipg * fs->fs_ncg) {
                (void) ufs_fault(ITOV(ip),
                    "ifree: range, dev = 0x%x, ino = %d, fs = %s\n",
                    (int)ip->i_dev, (int)ino, fs->fs_fsmnt);
                return;
        }
        cg = (int)itog(fs, ino);
        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
            (int)fs->fs_cgsize);

        cgp = bp->b_un.b_cg;
        if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
                brelse(bp);
                return;
        }
        mutex_enter(&ufsvfsp->vfs_lock);
        cgp->cg_time = gethrestime_sec();
        iused = cg_inosused(cgp);
        inot = (unsigned int)(ino % (ulong_t)fs->fs_ipg);
        if (isclr(iused, inot)) {
                mutex_exit(&ufsvfsp->vfs_lock);
                brelse(bp);
                (void) ufs_fault(ITOV(ip), "ufs_ifree: freeing free inode, "
                    "mode: (imode) %o, (omode) %o, ino:%d, "
                    "fs:%s",
                    ip->i_mode, mode, (int)ino, fs->fs_fsmnt);
                return;
        }
        clrbit(iused, inot);

        if (inot < (ulong_t)cgp->cg_irotor)
                cgp->cg_irotor = inot;
        cgp->cg_cs.cs_nifree++;
        fs->fs_cstotal.cs_nifree++;
        fs->fs_cs(fs, cg).cs_nifree++;
        if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
                cgp->cg_cs.cs_ndir--;
                fs->fs_cstotal.cs_ndir--;
                fs->fs_cs(fs, cg).cs_ndir--;
        }
        fs->fs_fmod = 1;
        ufs_notclean(ufsvfsp);
        TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
        TRANS_SI(ufsvfsp, fs, cg);
        bdrwrite(bp);
}

/*
 * Implement the cylinder overflow algorithm.
 *
 * The policy implemented by this algorithm is:
 *   1) allocate the block in its requested cylinder group.
 *   2) quadratically rehash on the cylinder group number.
 *   3) brute force search for a free block.
 * The size parameter means size for data blocks, mode for inodes.
 */
static ino_t
hashalloc(struct inode *ip, int cg, long pref, int size, ulong_t (*allocator)())
{
        struct fs *fs;
        int i;
        long result;
        int icg = cg;

        fs = ip->i_fs;
        /*
         * 1: preferred cylinder group
         */
        result = (*allocator)(ip, cg, pref, size);
        if (result)
                return (result);
        /*
         * 2: quadratic rehash
         */
        for (i = 1; i < fs->fs_ncg; i *= 2) {
                cg += i;
                if (cg >= fs->fs_ncg)
                        cg -= fs->fs_ncg;
                result = (*allocator)(ip, cg, 0, size);
                if (result)
                        return (result);
        }
        /*
         * 3: brute force search
         * Note that we start at i == 2, since 0 was checked initially,
         * and 1 is always checked in the quadratic rehash.
         */
        cg = (icg + 2) % fs->fs_ncg;
        for (i = 2; i < fs->fs_ncg; i++) {
                result = (*allocator)(ip, cg, 0, size);
                if (result)
                        return (result);
                cg++;
                if (cg == fs->fs_ncg)
                        cg = 0;
        }
        return (0);
}

/*
 * Determine whether a fragment can be extended.
 *
 * Check to see if the necessary fragments are available, and
 * if they are, allocate them.
 */
static daddr_t
fragextend(struct inode *ip, int cg, long bprev, int osize, int nsize)
{
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        struct fs *fs = ip->i_fs;
        struct buf *bp;
        struct cg *cgp;
        uchar_t *blksfree;
        long bno;
        int frags, bbase;
        int i, j;

        if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, nsize - osize))
                return (0);
        frags = numfrags(fs, nsize);
        bbase = (int)fragnum(fs, bprev);
        if (bbase > fragnum(fs, (bprev + frags - 1))) {
                /* cannot extend across a block boundary */
                return (0);
        }

        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
            (int)fs->fs_cgsize);
        cgp = bp->b_un.b_cg;
        if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
                brelse(bp);
                return (0);
        }

        blksfree = cg_blksfree(cgp);
        mutex_enter(&ufsvfsp->vfs_lock);
        bno = dtogd(fs, bprev);
        for (i = numfrags(fs, osize); i < frags; i++) {
                if (isclr(blksfree, bno + i)) {
                        mutex_exit(&ufsvfsp->vfs_lock);
                        brelse(bp);
                        return (0);
                }
                if ((TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, bprev + i)),
                    fs->fs_fsize))) {
                        mutex_exit(&ufsvfsp->vfs_lock);
                        brelse(bp);
                        return (0);
                }
        }

        cgp->cg_time = gethrestime_sec();
        /*
         * The current fragment can be extended,
         * deduct the count on fragment being extended into
         * increase the count on the remaining fragment (if any)
         * allocate the extended piece.
         */
        for (i = frags; i < fs->fs_frag - bbase; i++)
                if (isclr(blksfree, bno + i))
                        break;
        j = i - numfrags(fs, osize);
        cgp->cg_frsum[j]--;
        ASSERT(cgp->cg_frsum[j] >= 0);
        if (i != frags)
                cgp->cg_frsum[i - frags]++;
        for (i = numfrags(fs, osize); i < frags; i++) {
                clrbit(blksfree, bno + i);
                cgp->cg_cs.cs_nffree--;
                fs->fs_cs(fs, cg).cs_nffree--;
                fs->fs_cstotal.cs_nffree--;
        }
        fs->fs_fmod = 1;
        ufs_notclean(ufsvfsp);
        TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
        TRANS_SI(ufsvfsp, fs, cg);
        bdrwrite(bp);
        return ((daddr_t)bprev);
}

/*
 * Determine whether a block can be allocated.
 *
 * Check to see if a block of the apprpriate size
 * is available, and if it is, allocate it.
 */
static daddr_t
alloccg(struct inode *ip, int cg, daddr_t bpref, int size)
{
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        struct fs *fs = ip->i_fs;
        struct buf *bp;
        struct cg *cgp;
        uchar_t *blksfree;
        int bno, frags;
        int allocsiz;
        int i;

        /*
         * Searching for space could be time expensive so do some
         * up front checking to verify that there is actually space
         * available (free blocks or free frags).
         */
        if (fs->fs_cs(fs, cg).cs_nbfree == 0) {
                if (size == fs->fs_bsize)
                        return (0);

                /*
                 * If there are not enough free frags then return.
                 */
                if (fs->fs_cs(fs, cg).cs_nffree < numfrags(fs, size))
                        return (0);
        }

        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
            (int)fs->fs_cgsize);

        cgp = bp->b_un.b_cg;
        if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
            (cgp->cg_cs.cs_nbfree == 0 && size == fs->fs_bsize)) {
                brelse(bp);
                return (0);
        }
        blksfree = cg_blksfree(cgp);
        mutex_enter(&ufsvfsp->vfs_lock);
        cgp->cg_time = gethrestime_sec();
        if (size == fs->fs_bsize) {
                if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
                        goto errout;
                fs->fs_fmod = 1;
                ufs_notclean(ufsvfsp);
                TRANS_SI(ufsvfsp, fs, cg);
                bdrwrite(bp);
                return (bno);
        }
        /*
         * Check fragment bitmap to see if any fragments are already available.
         * mapsearch() may fail because the fragment that fits this request
         * might still be on the cancel list and not available for re-use yet.
         * Look for a bigger sized fragment to allocate first before we have
         * to give up and fragment a whole new block eventually.
         */
        frags = numfrags(fs, size);
        allocsiz = frags;
next_size:
        for (; allocsiz < fs->fs_frag; allocsiz++)
                if (cgp->cg_frsum[allocsiz] != 0)
                        break;

        if (allocsiz != fs->fs_frag) {
                bno = mapsearch(ufsvfsp, cgp, bpref, allocsiz);
                if (bno < 0 && allocsiz < (fs->fs_frag - 1)) {
                        allocsiz++;
                        goto next_size;
                }
        }

        if (allocsiz == fs->fs_frag || bno < 0) {
                /*
                 * No fragments were available, so a block
                 * will be allocated and hacked up.
                 */
                if (cgp->cg_cs.cs_nbfree == 0)
                        goto errout;
                if ((bno = alloccgblk(ufsvfsp, cgp, bpref, bp)) == 0)
                        goto errout;
                bpref = dtogd(fs, bno);
                for (i = frags; i < fs->fs_frag; i++)
                        setbit(blksfree, bpref + i);
                i = fs->fs_frag - frags;
                cgp->cg_cs.cs_nffree += i;
                fs->fs_cstotal.cs_nffree += i;
                fs->fs_cs(fs, cg).cs_nffree += i;
                cgp->cg_frsum[i]++;
                fs->fs_fmod = 1;
                ufs_notclean(ufsvfsp);
                TRANS_SI(ufsvfsp, fs, cg);
                bdrwrite(bp);
                return (bno);
        }

        for (i = 0; i < frags; i++)
                clrbit(blksfree, bno + i);
        cgp->cg_cs.cs_nffree -= frags;
        fs->fs_cstotal.cs_nffree -= frags;
        fs->fs_cs(fs, cg).cs_nffree -= frags;
        cgp->cg_frsum[allocsiz]--;
        ASSERT(cgp->cg_frsum[allocsiz] >= 0);
        if (frags != allocsiz) {
                cgp->cg_frsum[allocsiz - frags]++;
        }
        fs->fs_fmod = 1;
        ufs_notclean(ufsvfsp);
        TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
        TRANS_SI(ufsvfsp, fs, cg);
        bdrwrite(bp);
        return (cg * fs->fs_fpg + bno);
errout:
        mutex_exit(&ufsvfsp->vfs_lock);
        brelse(bp);
        return (0);
}

/*
 * Allocate a block in a cylinder group.
 *
 * This algorithm implements the following policy:
 *   1) allocate the requested block.
 *   2) allocate a rotationally optimal block in the same cylinder.
 *   3) allocate the next available block on the block rotor for the
 *      specified cylinder group.
 * Note that this routine only allocates fs_bsize blocks; these
 * blocks may be fragmented by the routine that allocates them.
 */
static daddr_t
alloccgblk(
        struct ufsvfs *ufsvfsp,
        struct cg *cgp,
        daddr_t bpref,
        struct buf *bp)
{
        daddr_t bno;
        int cylno, pos, delta, rotbl_size;
        short *cylbp;
        int i;
        struct fs *fs;
        uchar_t *blksfree;
        daddr_t blkno, rpos, frag;
        short *blks;
        int32_t *blktot;

        ASSERT(MUTEX_HELD(&ufsvfsp->vfs_lock));
        fs = ufsvfsp->vfs_fs;
        blksfree = cg_blksfree(cgp);
        if (bpref == 0) {
                bpref = cgp->cg_rotor;
                goto norot;
        }
        bpref = blknum(fs, bpref);
        bpref = dtogd(fs, bpref);
        /*
         * If the requested block is available, use it.
         */
        if (isblock(fs, blksfree, (daddr_t)fragstoblks(fs, bpref))) {
                bno = bpref;
                goto gotit;
        }
        /*
         * Check for a block available on the same cylinder.
         */
        cylno = cbtocylno(fs, bpref);
        if (cg_blktot(cgp)[cylno] == 0)
                goto norot;
        if (fs->fs_cpc == 0) {
                /*
                 * Block layout info is not available, so just
                 * have to take any block in this cylinder.
                 */
                bpref = howmany(fs->fs_spc * cylno, NSPF(fs));
                goto norot;
        }
        /*
         * Check the summary information to see if a block is
         * available in the requested cylinder starting at the
         * requested rotational position and proceeding around.
         */
        cylbp = cg_blks(ufsvfsp, cgp, cylno);
        pos = cbtorpos(ufsvfsp, bpref);
        for (i = pos; i < ufsvfsp->vfs_nrpos; i++)
                if (cylbp[i] > 0)
                        break;
        if (i == ufsvfsp->vfs_nrpos)
                for (i = 0; i < pos; i++)
                        if (cylbp[i] > 0)
                                break;
        if (cylbp[i] > 0) {
                /*
                 * Found a rotational position, now find the actual
                 * block.  A "panic" if none is actually there.
                 */

                /*
                 * Up to this point, "pos" has referred to the rotational
                 * position of the desired block.  From now on, it holds
                 * the offset of the current cylinder within a cylinder
                 * cycle.  (A cylinder cycle refers to a set of cylinders
                 * which are described by a single rotational table; the
                 * size of the cycle is fs_cpc.)
                 *
                 * bno is set to the block number of the first block within
                 * the current cylinder cycle.
                 */

                pos = cylno % fs->fs_cpc;
                bno = (cylno - pos) * fs->fs_spc / NSPB(fs);

                /*
                 * The blocks within a cylinder are grouped into equivalence
                 * classes according to their "rotational position."  There
                 * are two tables used to determine these classes.
                 *
                 * The positional offset table (fs_postbl) has an entry for
                 * each rotational position of each cylinder in a cylinder
                 * cycle.  This entry contains the relative block number
                 * (counting from the start of the cylinder cycle) of the
                 * first block in the equivalence class for that position
                 * and that cylinder.  Positions for which no blocks exist
                 * are indicated by a -1.
                 *
                 * The rotational delta table (fs_rotbl) has an entry for
                 * each block in a cylinder cycle.  This entry contains
                 * the offset from that block to the next block in the
                 * same equivalence class.  The last block in the class
                 * is indicated by a zero in the table.
                 *
                 * The following code, then, walks through all of the blocks
                 * in the cylinder (cylno) which we're allocating within
                 * which are in the equivalence class for the rotational
                 * position (i) which we're allocating within.
                 */

                if (fs_postbl(ufsvfsp, pos)[i] == -1) {
                        (void) ufs_fault(ufsvfsp->vfs_root,
                            "alloccgblk: cyl groups corrupted, pos = %d, "
                            "i = %d, fs = %s\n", pos, i, fs->fs_fsmnt);
                        return (0);
                }

                /*
                 * There is one entry in the rotational table for each block
                 * in the cylinder cycle.  These are whole blocks, not frags.
                 */

                rotbl_size = (fs->fs_cpc * fs->fs_spc) >>
                    (fs->fs_fragshift + fs->fs_fsbtodb);

                /*
                 * As we start, "i" is the rotational position within which
                 * we're searching.  After the next line, it will be a block
                 * number (relative to the start of the cylinder cycle)
                 * within the equivalence class of that rotational position.
                 */

                i = fs_postbl(ufsvfsp, pos)[i];

                for (;;) {
                        if (isblock(fs, blksfree, (daddr_t)(bno + i))) {
                                bno = blkstofrags(fs, (bno + i));
                                goto gotit;
                        }
                        delta = fs_rotbl(fs)[i];
                        if (delta <= 0 ||               /* End of chain, or */
                            delta + i > rotbl_size)     /* end of table? */
                                break;                  /* If so, panic. */
                        i += delta;
                }
                (void) ufs_fault(ufsvfsp->vfs_root,
                    "alloccgblk: can't find blk in cyl, pos:%d, i:%d, "
                    "fs:%s bno: %x\n", pos, i, fs->fs_fsmnt, (int)bno);
                return (0);
        }
norot:
        /*
         * No blocks in the requested cylinder, so take
         * next available one in this cylinder group.
         */
        bno = mapsearch(ufsvfsp, cgp, bpref, (int)fs->fs_frag);
        if (bno < 0)
                return (0);
        cgp->cg_rotor = bno;
gotit:
        blkno = fragstoblks(fs, bno);
        frag = (cgp->cg_cgx * fs->fs_fpg) + bno;
        if (TRANS_ISCANCEL(ufsvfsp, ldbtob(fsbtodb(fs, frag)), fs->fs_bsize))
                goto norot;
        clrblock(fs, blksfree, (long)blkno);
        /*
         * the other cg/sb/si fields are TRANS'ed by the caller
         */
        cgp->cg_cs.cs_nbfree--;
        fs->fs_cstotal.cs_nbfree--;
        fs->fs_cs(fs, cgp->cg_cgx).cs_nbfree--;
        cylno = cbtocylno(fs, bno);
        blks = cg_blks(ufsvfsp, cgp, cylno);
        rpos = cbtorpos(ufsvfsp, bno);
        blktot = cg_blktot(cgp);
        blks[rpos]--;
        blktot[cylno]--;
        TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
        fs->fs_fmod = 1;
        return (frag);
}

/*
 * Determine whether an inode can be allocated.
 *
 * Check to see if an inode is available, and if it is,
 * allocate it using the following policy:
 *   1) allocate the requested inode.
 *   2) allocate the next available inode after the requested
 *      inode in the specified cylinder group.
 */
static ino_t
ialloccg(struct inode *ip, int cg, daddr_t ipref, int mode)
{
        struct ufsvfs *ufsvfsp = ip->i_ufsvfs;
        struct fs *fs = ip->i_fs;
        struct cg *cgp;
        struct buf *bp;
        int start, len, loc, map, i;
        char *iused;

        if (fs->fs_cs(fs, cg).cs_nifree == 0)
                return (0);
        bp = UFS_BREAD(ufsvfsp, ip->i_dev, (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
            (int)fs->fs_cgsize);

        cgp = bp->b_un.b_cg;
        if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp) ||
            cgp->cg_cs.cs_nifree == 0) {
                brelse(bp);
                return (0);
        }
        iused = cg_inosused(cgp);
        mutex_enter(&ufsvfsp->vfs_lock);
        /*
         * While we are waiting for the mutex, someone may have taken
         * the last available inode.  Need to recheck.
         */
        if (cgp->cg_cs.cs_nifree == 0) {
                mutex_exit(&ufsvfsp->vfs_lock);
                brelse(bp);
                return (0);
        }

        cgp->cg_time = gethrestime_sec();
        if (ipref) {
                ipref %= fs->fs_ipg;
                if (isclr(iused, ipref))
                        goto gotit;
        }
        start = cgp->cg_irotor / NBBY;
        len = howmany(fs->fs_ipg - cgp->cg_irotor, NBBY);
        loc = skpc(0xff, (uint_t)len, &iused[start]);
        if (loc == 0) {
                len = start + 1;
                start = 0;
                loc = skpc(0xff, (uint_t)len, &iused[0]);
                if (loc == 0) {
                        mutex_exit(&ufsvfsp->vfs_lock);
                        (void) ufs_fault(ITOV(ip),
                            "ialloccg: map corrupted, cg = %d, irotor = %d, "
                            "fs = %s\n", cg, (int)cgp->cg_irotor, fs->fs_fsmnt);
                        return (0);
                }
        }
        i = start + len - loc;
        map = iused[i];
        ipref = i * NBBY;
        for (i = 1; i < (1 << NBBY); i <<= 1, ipref++) {
                if ((map & i) == 0) {
                        cgp->cg_irotor = ipref;
                        goto gotit;
                }
        }

        mutex_exit(&ufsvfsp->vfs_lock);
        (void) ufs_fault(ITOV(ip), "ialloccg: block not in mapfs = %s",
            fs->fs_fsmnt);
        return (0);
gotit:
        setbit(iused, ipref);
        cgp->cg_cs.cs_nifree--;
        fs->fs_cstotal.cs_nifree--;
        fs->fs_cs(fs, cg).cs_nifree--;
        if (((mode & IFMT) == IFDIR) || ((mode & IFMT) == IFATTRDIR)) {
                cgp->cg_cs.cs_ndir++;
                fs->fs_cstotal.cs_ndir++;
                fs->fs_cs(fs, cg).cs_ndir++;
        }
        fs->fs_fmod = 1;
        ufs_notclean(ufsvfsp);
        TRANS_BUF(ufsvfsp, 0, fs->fs_cgsize, bp, DT_CG);
        TRANS_SI(ufsvfsp, fs, cg);
        bdrwrite(bp);
        return (cg * fs->fs_ipg + ipref);
}

/*
 * Find a block of the specified size in the specified cylinder group.
 *
 * It is a panic if a request is made to find a block if none are
 * available.
 */
static daddr_t
mapsearch(struct ufsvfs *ufsvfsp, struct cg *cgp, daddr_t bpref, int allocsiz)
{
        struct fs *fs   = ufsvfsp->vfs_fs;
        daddr_t bno, cfrag;
        int start, len, loc, i, last, first, secondtime;
        int blk, field, subfield, pos;
        int gotit;

        /*
         * ufsvfs->vfs_lock is held when calling this.
         */
        /*
         * Find the fragment by searching through the
         * free block map for an appropriate bit pattern.
         */
        if (bpref)
                start = dtogd(fs, bpref) / NBBY;
        else
                start = cgp->cg_frotor / NBBY;
        /*
         * the following loop performs two scans -- the first scan
         * searches the bottom half of the array for a match and the
         * second scan searches the top half of the array.  The loops
         * have been merged just to make things difficult.
         */
        first = start;
        last = howmany(fs->fs_fpg, NBBY);
        secondtime = 0;
        cfrag = cgp->cg_cgx * fs->fs_fpg;
        while (first < last) {
                len = last - first;
                /*
                 * search the array for a match
                 */
                loc = scanc((unsigned)len, (uchar_t *)&cg_blksfree(cgp)[first],
                    (uchar_t *)fragtbl[fs->fs_frag],
                    (int)(1 << (allocsiz - 1 + (fs->fs_frag % NBBY))));
                /*
                 * match found
                 */
                if (loc) {
                        bno = (last - loc) * NBBY;

                        /*
                         * Found the byte in the map, sift
                         * through the bits to find the selected frag
                         */
                        cgp->cg_frotor = bno;
                        gotit = 0;
                        for (i = bno + NBBY; bno < i; bno += fs->fs_frag) {
                                blk = blkmap(fs, cg_blksfree(cgp), bno);
                                blk <<= 1;
                                field = around[allocsiz];
                                subfield = inside[allocsiz];
                                for (pos = 0;
                                    pos <= fs->fs_frag - allocsiz;
                                    pos++) {
                                        if ((blk & field) == subfield) {
                                                gotit++;
                                                break;
                                        }
                                        field <<= 1;
                                        subfield <<= 1;
                                }
                                if (gotit)
                                        break;
                        }
                        bno += pos;

                        /*
                         * success if block is *not* being converted from
                         * metadata into userdata (harpy).  If so, ignore.
                         */
                        if (!TRANS_ISCANCEL(ufsvfsp,
                            ldbtob(fsbtodb(fs, (cfrag+bno))),
                            allocsiz * fs->fs_fsize))
                                return (bno);

                        /*
                         * keep looking -- this block is being converted
                         */
                        first = (last - loc) + 1;
                        loc = 0;
                        if (first < last)
                                continue;
                }
                /*
                 * no usable matches in bottom half -- now search the top half
                 */
                if (secondtime)
                        /*
                         * no usable matches in top half -- all done
                         */
                        break;
                secondtime = 1;
                last = start + 1;
                first = 0;
        }
        /*
         * no usable matches
         */
        return ((daddr_t)-1);
}

#define UFSNADDR (NDADDR + NIADDR)      /* NADDR applies to (obsolete) S5FS */
#define IB(i)   (NDADDR + (i))  /* index of i'th indirect block ptr */
#define SINGLE  0               /* single indirect block ptr */
#define DOUBLE  1               /* double indirect block ptr */
#define TRIPLE  2               /* triple indirect block ptr */

/*
 * Acquire a write lock, and keep trying till we get it
 */
static int
allocsp_wlockfs(struct vnode *vp, struct lockfs *lf)
{
        int err = 0;

lockagain:
        do {
                err = ufs_fiolfss(vp, lf);
                if (err)
                        return (err);
        } while (!LOCKFS_IS_ULOCK(lf));

        lf->lf_lock = LOCKFS_WLOCK;
        lf->lf_flags = 0;
        lf->lf_comment = NULL;
        err = ufs__fiolfs(vp, lf, 1, 0);

        if (err == EBUSY || err == EINVAL)
                goto lockagain;

        return (err);
}

/*
 * Release the write lock
 */
static int
allocsp_unlockfs(struct vnode *vp, struct lockfs *lf)
{
        int err = 0;

        lf->lf_lock = LOCKFS_ULOCK;
        lf->lf_flags = 0;
        err = ufs__fiolfs(vp, lf, 1, 0);
        return (err);
}

struct allocsp_undo {
        daddr_t offset;
        daddr_t blk;
        struct allocsp_undo *next;
};

/*
 * ufs_allocsp() can be used to pre-allocate blocks for a file on a given
 * file system. For direct blocks, the blocks are allocated from the offset
 * requested to the block boundary, then any full blocks are allocated,
 * and finally any remainder.
 * For indirect blocks the blocks are not initialized and are
 * only marked as allocated. These addresses are then stored as negative
 * block numbers in the inode to imply special handling. UFS has been modified
 * where necessary to understand this new notion.
 * Successfully fallocated files will have IFALLOCATE cflag set in the inode.
 */
int
ufs_allocsp(struct vnode *vp, struct flock64 *lp, cred_t *cr)
{
        struct lockfs lf;
        int berr, err, resv, issync;
        off_t istart, len; /* istart, special for idb */
        struct inode *ip;
        struct fs *fs;
        struct ufsvfs *ufsvfsp;
        u_offset_t resid, i, uoff;
        daddr32_t db_undo[NDADDR];      /* old direct blocks */
        struct allocsp_undo *ib_undo = NULL;    /* ib undo */
        struct allocsp_undo *undo = NULL;
        u_offset_t osz;                 /* old file size */
        int chunkblks = 0;              /* # of blocks in 1 allocation */
        int cnt = 0;
        daddr_t allocblk;
        daddr_t totblks = 0;
        struct ulockfs  *ulp;
        size_t done_len;
        int nbytes, offsetn;


        ASSERT(vp->v_type == VREG);

        ip = VTOI(vp);
        fs = ip->i_fs;
        if ((ufsvfsp = ip->i_ufsvfs) == NULL) {
                err = EIO;
                goto out_allocsp;
        }

        istart = blkroundup(fs, (lp->l_start));
        len = blkroundup(fs, (lp->l_len));
        chunkblks = blkroundup(fs, ufsvfsp->vfs_iotransz) / fs->fs_bsize;
        ulp = &ufsvfsp->vfs_ulockfs;

        if (lp->l_start < 0 || lp->l_len <= 0)
                return (EINVAL);

        /* Quickly check to make sure we have space before we proceed */
        if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree) {
                if (TRANS_ISTRANS(ufsvfsp)) {
                        ufs_delete_drain_wait(ufsvfsp, 1);
                        if (lblkno(fs, len) > fs->fs_cstotal.cs_nbfree)
                                return (ENOSPC);
                } else
                        return (ENOSPC);
        }

        /*
         * We will keep i_rwlock locked as WRITER through out the function
         * since we don't want anyone else reading or writing to the inode
         * while we are in the middle of fallocating the file.
         */
        rw_enter(&ip->i_rwlock, RW_WRITER);

        /* Back up the direct block list, used for undo later if necessary */
        rw_enter(&ip->i_contents, RW_READER);
        for (i = 0; i < NDADDR; i++)
                db_undo[i] = ip->i_db[i];
        osz = ip->i_size;
        rw_exit(&ip->i_contents);

        /* Write lock the file system */
        if (err = allocsp_wlockfs(vp, &lf))
                goto exit;

        /*
         * Allocate any direct blocks now.
         * Blocks are allocated from the offset requested to the block
         * boundary, then any full blocks are allocated, and finally any
         * remainder.
         */
        if (lblkno(fs, lp->l_start) < NDADDR) {
                ufs_trans_trunc_resv(ip, ip->i_size + (NDADDR * fs->fs_bsize),
                    &resv, &resid);
                TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);

                rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
                rw_enter(&ip->i_contents, RW_WRITER);

                done_len = 0;
                while ((done_len < lp->l_len) &&
                    (lblkno(fs, lp->l_start + done_len) < NDADDR)) {
                        uoff = (offset_t)(lp->l_start + done_len);
                        offsetn = (int)blkoff(fs, uoff);
                        nbytes = (int)MIN(fs->fs_bsize - offsetn,
                            lp->l_len - done_len);

                        berr = bmap_write(ip, uoff, offsetn + nbytes,
                            BI_FALLOCATE, &allocblk, cr);
                        /* Yikes error, quit */
                        if (berr) {
                                TRANS_INODE(ufsvfsp, ip);
                                rw_exit(&ip->i_contents);
                                rw_exit(&ufsvfsp->vfs_dqrwlock);
                                TRANS_END_CSYNC(ufsvfsp, err, issync,
                                    TOP_ALLOCSP, resv);
                                err = allocsp_unlockfs(vp, &lf);
                                goto exit;
                        }

                        if (allocblk) {
                                totblks++;
                                if ((uoff + nbytes) > ip->i_size)
                                        ip->i_size = (uoff + nbytes);
                        }
                        done_len += nbytes;
                }

                TRANS_INODE(ufsvfsp, ip);
                rw_exit(&ip->i_contents);
                rw_exit(&ufsvfsp->vfs_dqrwlock);
                TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);

                /* start offset for indirect allocation */
                istart =  (uoff + nbytes);
        }

        /* Break the transactions into vfs_iotransz units */
        ufs_trans_trunc_resv(ip, ip->i_size +
            blkroundup(fs, ufsvfsp->vfs_iotransz), &resv, &resid);
        TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);

        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
        rw_enter(&ip->i_contents, RW_WRITER);

        /* Now go about fallocating necessary indirect blocks */
        for (i = istart; i < (lp->l_start + lp->l_len); i += fs->fs_bsize) {
                berr = bmap_write(ip, i, fs->fs_bsize, BI_FALLOCATE,
                    &allocblk, cr);
                if (berr) {
                        TRANS_INODE(ufsvfsp, ip);
                        rw_exit(&ip->i_contents);
                        rw_exit(&ufsvfsp->vfs_dqrwlock);
                        TRANS_END_CSYNC(ufsvfsp, err, issync,
                            TOP_ALLOCSP, resv);
                        err = allocsp_unlockfs(vp, &lf);
                        goto exit;
                }

                /* Update the blk counter only if new block was added */
                if (allocblk) {
                        /* Save undo information */
                        undo = kmem_alloc(sizeof (struct allocsp_undo),
                            KM_SLEEP);
                        undo->offset = i;
                        undo->blk = allocblk;
                        undo->next = ib_undo;
                        ib_undo = undo;
                        totblks++;

                        if (i >= ip->i_size)
                                ip->i_size += fs->fs_bsize;
                }
                cnt++;

                /* Being a good UFS citizen, let others get a share */
                if (cnt == chunkblks) {
                        /*
                         * If there are waiters or the fs is hard locked,
                         * error locked, or read-only error locked,
                         * quit with EIO
                         */
                        if (ULOCKFS_IS_HLOCK(ulp) || ULOCKFS_IS_ELOCK(ulp) ||
                            ULOCKFS_IS_ROELOCK(ulp)) {
                                ip->i_cflags |= IFALLOCATE;
                                TRANS_INODE(ufsvfsp, ip);
                                rw_exit(&ip->i_contents);
                                rw_exit(&ufsvfsp->vfs_dqrwlock);

                                TRANS_END_CSYNC(ufsvfsp, err, issync,
                                    TOP_ALLOCSP, resv);
                                rw_exit(&ip->i_rwlock);
                                (void) allocsp_unlockfs(vp, &lf);
                                return (EIO);
                        }

                        TRANS_INODE(ufsvfsp, ip);
                        rw_exit(&ip->i_contents);
                        rw_exit(&ufsvfsp->vfs_dqrwlock);

                        /* End the current transaction */
                        TRANS_END_CSYNC(ufsvfsp, err, issync,
                            TOP_ALLOCSP, resv);

                        if (CV_HAS_WAITERS(&ulp->ul_cv)) {
                                /* Release the write lock */
                                if (err = allocsp_unlockfs(vp, &lf))
                                        goto exit;

                                /* Wake up others waiting to do operations */
                                mutex_enter(&ulp->ul_lock);
                                cv_broadcast(&ulp->ul_cv);
                                mutex_exit(&ulp->ul_lock);

                                /* Grab the write lock again */
                                if (err = allocsp_wlockfs(vp, &lf))
                                        goto exit;
                        } /* end of CV_HAS_WAITERS(&ulp->ul_cv) */

                        /* Reserve more space in log for this file */
                        ufs_trans_trunc_resv(ip,
                            ip->i_size + blkroundup(fs, ufsvfsp->vfs_iotransz),
                            &resv, &resid);
                        TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);

                        rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
                        rw_enter(&ip->i_contents, RW_WRITER);

                        cnt = 0;        /* reset cnt b/c of new transaction */
                }
        }

        if (!err && !berr)
                ip->i_cflags |= IFALLOCATE;

        /* If the file has grown then correct the file size */
        if (osz < (lp->l_start + lp->l_len))
                ip->i_size = (lp->l_start + lp->l_len);

        /* Release locks, end log transaction and unlock fs */
        TRANS_INODE(ufsvfsp, ip);
        rw_exit(&ip->i_contents);
        rw_exit(&ufsvfsp->vfs_dqrwlock);

        TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);
        err = allocsp_unlockfs(vp, &lf);

        /*
         * @ exit label, we should no longer be holding the fs write lock, and
         * all logging transactions should have been ended. We still hold
         * ip->i_rwlock.
         */
exit:
        /*
         * File has grown larger than 2GB. Set flag
         * in superblock to indicate this, if it
         * is not already set.
         */
        if ((ip->i_size > MAXOFF32_T) &&
            !(fs->fs_flags & FSLARGEFILES)) {
                ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
                mutex_enter(&ufsvfsp->vfs_lock);
                fs->fs_flags |= FSLARGEFILES;
                ufs_sbwrite(ufsvfsp);
                mutex_exit(&ufsvfsp->vfs_lock);
        }

        /*
         * Since we couldn't allocate completely, we will undo the allocations.
         */
        if (berr) {
                ufs_trans_trunc_resv(ip, totblks * fs->fs_bsize, &resv, &resid);
                TRANS_BEGIN_CSYNC(ufsvfsp, issync, TOP_ALLOCSP, resv);

                rw_enter(&ufsvfsp->vfs_dqrwlock, RW_READER);
                rw_enter(&ip->i_contents, RW_WRITER);

                /* Direct blocks */
                for (i = 0; i < NDADDR; i++) {
                        /*
                         * Only free the block if they are not same, and
                         * the old one isn't zero (the fragment was
                         * re-allocated).
                         */
                        if (db_undo[i] != ip->i_db[i] && db_undo[i] == 0) {
                                free(ip, ip->i_db[i], fs->fs_bsize, 0);
                                ip->i_db[i] = 0;
                        }
                }

                /* Undo the indirect blocks */
                while (ib_undo != NULL) {
                        undo = ib_undo;
                        err = bmap_set_bn(vp, undo->offset, 0);
                        if (err)
                                cmn_err(CE_PANIC, "ufs_allocsp(): failed to "
                                    "undo allocation of block %ld",
                                    undo->offset);
                        free(ip, undo->blk, fs->fs_bsize, I_IBLK);
                        ib_undo = undo->next;
                        kmem_free(undo, sizeof (struct allocsp_undo));
                }

                ip->i_size = osz;
                TRANS_INODE(ufsvfsp, ip);

                rw_exit(&ip->i_contents);
                rw_exit(&ufsvfsp->vfs_dqrwlock);

                TRANS_END_CSYNC(ufsvfsp, err, issync, TOP_ALLOCSP, resv);

                rw_exit(&ip->i_rwlock);
                return (berr);
        }

        /*
         * Don't forget to free the undo chain :)
         */
        while (ib_undo != NULL) {
                undo = ib_undo;
                ib_undo = undo->next;
                kmem_free(undo, sizeof (struct allocsp_undo));
        }

        rw_exit(&ip->i_rwlock);

out_allocsp:
        return (err);
}

/*
 * Free storage space associated with the specified inode.  The portion
 * to be freed is specified by lp->l_start and lp->l_len (already
 * normalized to a "whence" of 0).
 *
 * This is an experimental facility whose continued existence is not
 * guaranteed.  Currently, we only support the special case
 * of l_len == 0, meaning free to end of file.
 *
 * Blocks are freed in reverse order.  This FILO algorithm will tend to
 * maintain a contiguous free list much longer than FIFO.
 * See also ufs_itrunc() in ufs_inode.c.
 *
 * Bug: unused bytes in the last retained block are not cleared.
 * This may result in a "hole" in the file that does not read as zeroes.
 */
/* ARGSUSED */
int
ufs_freesp(struct vnode *vp, struct flock64 *lp, int flag, cred_t *cr)
{
        int i;
        struct inode *ip = VTOI(vp);
        int error;

        ASSERT(vp->v_type == VREG);
        ASSERT(lp->l_start >= 0);       /* checked by convoff */

        if (lp->l_len != 0)
                return (EINVAL);

        rw_enter(&ip->i_contents, RW_READER);
        if (ip->i_size == (u_offset_t)lp->l_start) {
                rw_exit(&ip->i_contents);
                return (0);
        }

        /*
         * Check if there is any active mandatory lock on the
         * range that will be truncated/expanded.
         */
        if (MANDLOCK(vp, ip->i_mode)) {
                offset_t save_start;

                save_start = lp->l_start;

                if (ip->i_size < lp->l_start) {
                        /*
                         * "Truncate up" case: need to make sure there
                         * is no lock beyond current end-of-file. To
                         * do so, we need to set l_start to the size
                         * of the file temporarily.
                         */
                        lp->l_start = ip->i_size;
                }
                lp->l_type = F_WRLCK;
                lp->l_sysid = 0;
                lp->l_pid = ttoproc(curthread)->p_pid;
                i = (flag & (FNDELAY|FNONBLOCK)) ? 0 : SLPFLCK;
                rw_exit(&ip->i_contents);
                if ((i = reclock(vp, lp, i, 0, lp->l_start, NULL)) != 0 ||
                    lp->l_type != F_UNLCK) {
                        return (i ? i : EAGAIN);
                }
                rw_enter(&ip->i_contents, RW_READER);

                lp->l_start = save_start;
        }

        /*
         * Make sure a write isn't in progress (allocating blocks)
         * by acquiring i_rwlock (we promised ufs_bmap we wouldn't
         * truncate while it was allocating blocks).
         * Grab the locks in the right order.
         */
        rw_exit(&ip->i_contents);
        rw_enter(&ip->i_rwlock, RW_WRITER);
        error = TRANS_ITRUNC(ip, (u_offset_t)lp->l_start, 0, cr);
        rw_exit(&ip->i_rwlock);
        return (error);
}

/*
 * Find a cg with as close to nb contiguous bytes as possible
 *      THIS MAY TAKE MANY DISK READS!
 *
 * Implemented in an attempt to allocate contiguous blocks for
 * writing the ufs log file to, minimizing future disk head seeking
 */
daddr_t
contigpref(ufsvfs_t *ufsvfsp, size_t nb, size_t minb)
{
        struct fs       *fs     = ufsvfsp->vfs_fs;
        daddr_t         nblk    = lblkno(fs, blkroundup(fs, nb));
        daddr_t         minblk  = lblkno(fs, blkroundup(fs, minb));
        daddr_t         savebno, curbno, cgbno;
        int             cg, cgblks, savecg, savenblk, curnblk, startcg;
        uchar_t         *blksfree;
        buf_t           *bp;
        struct cg       *cgp;

        savenblk = 0;
        savecg = 0;
        savebno = 0;

        if ((startcg = findlogstartcg(fs, nblk, minblk)) == -1)
                cg = 0; /* Nothing suitable found */
        else
                cg = startcg;

        for (; cg < fs->fs_ncg; ++cg) {
                /*
                 * find the largest contiguous range in this cg
                 */
                bp = UFS_BREAD(ufsvfsp, ufsvfsp->vfs_dev,
                    (daddr_t)fsbtodb(fs, cgtod(fs, cg)),
                    (int)fs->fs_cgsize);
                cgp = bp->b_un.b_cg;
                if (bp->b_flags & B_ERROR || !cg_chkmagic(cgp)) {
                        brelse(bp);
                        continue;
                }
                blksfree = cg_blksfree(cgp);        /* free array */
                cgblks = fragstoblks(fs, fs->fs_fpg); /* blks in free array */
                cgbno = 0;
                while (cgbno < cgblks && savenblk < nblk) {
                        /* find a free block */
                        for (; cgbno < cgblks; ++cgbno) {
                                if (isblock(fs, blksfree, cgbno)) {
                                        if (startcg != -1) {
                                                brelse(bp);
                                                savecg = startcg;
                                                savebno = cgbno;
                                                goto done;
                                        } else
                                                break;
                                }
                        }
                        curbno = cgbno;
                        /* count the number of free blocks */
                        for (curnblk = 0; cgbno < cgblks; ++cgbno) {
                                if (!isblock(fs, blksfree, cgbno))
                                        break;
                                if (++curnblk >= nblk)
                                        break;
                        }
                        if (curnblk > savenblk) {
                                savecg = cg;
                                savenblk = curnblk;
                                savebno = curbno;
                        }
                }
                brelse(bp);
                if (savenblk >= nblk)
                        break;
        }

done:

        /* convert block offset in cg to frag offset in cg */
        savebno = blkstofrags(fs, savebno);

        /* convert frag offset in cg to frag offset in fs */
        savebno += (savecg * fs->fs_fpg);

        return (savebno);
}

/*
 * The object of this routine is to find a start point for the UFS log.
 * Ideally the space should be allocated from the smallest possible number
 * of contiguous cylinder groups. This is found by using a sliding window
 * technique. The smallest window of contiguous cylinder groups, which is
 * still able to accommodate the target, is found by moving the window
 * through the cylinder groups in a single pass. The end of the window is
 * advanced until the space is accommodated, then the start is advanced until
 * it no longer fits, the end is then advanced again and so on until the
 * final cylinder group is reached. The first suitable instance is recorded
 * and its starting cg number is returned.
 *
 * If we are not able to find a minimum amount of space, represented by
 * minblk, or to do so uses more than the available extents, then return -1.
 */

int
findlogstartcg(struct fs *fs, daddr_t requested, daddr_t minblk)
{
        int      ncgs;           /* number of cylinder groups */
        daddr_t target;          /* amount of space sought */
        int      cwidth, ctotal; /* current window width and total */
        int      bwidth, btotal; /* best window width and total so far */
        int      s;     /* index of the first element in the current window */
        int      e;     /* index of the first element + the width */
                        /*  (i.e. 1 + index of last element) */
        int      bs; /* index of the first element in the best window so far */
        int      header, max_extents;

        target = requested;
        ncgs = fs->fs_ncg;

        header = sizeof (extent_block_t) - sizeof (extent_t);
        max_extents = ((fs->fs_bsize)-header) / sizeof (extent_t);
        cwidth = ctotal = 0;
        btotal = -1;
        bwidth = ncgs;
        s = e = 0;
        while (e < ncgs) {
        /* Advance the end of the window until it accommodates the target. */
                while (ctotal < target && e < ncgs) {
                        ctotal += fs->fs_cs(fs, e).cs_nbfree;
                        e++;
                }

                /*
                 * Advance the start of the window until it no longer
                 * accommodates the target.
                 */
                while (ctotal >= target && s < e) {
                        /* See if this is the smallest window so far. */
                        cwidth = e - s;
                        if (cwidth <= bwidth) {
                                if (cwidth == bwidth && ctotal <= btotal)
                                        goto more;
                                bwidth = cwidth;
                                btotal = ctotal;
                                bs = s;
                        }
more:
                        ctotal -= fs->fs_cs(fs, s).cs_nbfree;
                        s++;
                }
        }

        /*
         * If we cannot allocate the minimum required or we use too many
         * extents to do so, return -1.
         */
        if (btotal < minblk || bwidth > max_extents)
                bs = -1;

        return (bs);
}