root/usr/src/uts/common/io/fssnap.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


#include <sys/debug.h>
#include <sys/types.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/uio.h>
#include <sys/open.h>
#include <sys/cred.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/cmn_err.h>
#include <sys/modctl.h>
#include <sys/disp.h>
#include <sys/atomic.h>
#include <sys/filio.h>
#include <sys/stat.h> /* needed for S_IFBLK and S_IFCHR */
#include <sys/kstat.h>

#include <sys/ddi.h>
#include <sys/devops.h>
#include <sys/sunddi.h>
#include <sys/esunddi.h>
#include <sys/priv_names.h>

#include <sys/fssnap.h>
#include <sys/fssnap_if.h>

/*
 * This module implements the file system snapshot code, which provides a
 * point-in-time image of a file system for the purposes of online backup.
 * There are essentially two parts to this project: the driver half and the
 * file system half.  The driver half is a pseudo device driver called
 * "fssnap" that represents the snapshot.  Each snapshot is assigned a
 * number that corresponds to the minor number of the device, and a control
 * device with a high minor number is used to initiate snapshot creation and
 * deletion.  For all practical purposes the driver half acts like a
 * read-only disk device whose contents are exactly the same as the master
 * file system at the time the snapshot was created.
 *
 * The file system half provides interfaces necessary for performing the
 * file system dependent operations required to create and delete snapshots
 * and a special driver strategy routine that must always be used by the file
 * system for snapshots to work correctly.
 *
 * When a snapshot is to be created, the user utility will send an ioctl to
 * the control device of the driver half specifying the file system to be
 * snapshotted, the file descriptor of a backing-store file which is used to
 * hold old data before it is overwritten, and other snapshot parameters.
 * This ioctl is passed on to the file system specified in the original
 * ioctl request.  The file system is expected to be able to flush
 * everything out to make the file system consistent and lock it to ensure
 * no changes occur while the snapshot is being created.  It then calls
 * fssnap_create() to create state for a new snapshot, from which an opaque
 * handle is returned with the snapshot locked.  Next, the file system must
 * populate the "candidate bitmap", which tells the snapshot code which
 * "chunks" should be considered for copy-on-write (a chunk is the unit of
 * granularity used for copy-on-write, which is independent of the device
 * and file system block sizes).  This is typically done by scanning the
 * file system allocation bitmaps to determine which chunks contain
 * allocated blocks in the file system at the time the snapshot was created.
 * If a chunk has no allocated blocks, it does not need to be copied before
 * being written to.  Once the candidate bitmap is populated with
 * fssnap_set_candidate(), the file system calls fssnap_create_done() to
 * complete the snapshot creation and unlock the snapshot.  The file system
 * may now be unlocked and modifications to it resumed.
 *
 * Once a snapshot is created, the file system must perform all writes
 * through a special strategy routine, fssnap_strategy().  This strategy
 * routine determines whether the chunks contained by the write must be
 * copied before being overwritten by consulting the candidate bitmap
 * described above, and the "hastrans bitmap" which tells it whether the chunk
 * has been copied already or not.  If the chunk is a candidate but has not
 * been copied, it reads the old data in and adds it to a queue.  The
 * old data can then be overwritten with the new data.  For each old chunk
 * read in, an asynchronous task is dispatched to write the old data to
 * the backing file specified at snapshot creation time.  The
 * backing file is a sparse file the same size as the file system that
 * contains the old data at the offset that data originally had in the
 * file system.  If the queue containing in-memory chunks gets too large,
 * writes to the file system may be throttled by a semaphore until the
 * task queues have a chance to push some of the chunks to the backing file.
 *
 * With the candidate bitmap, the hastrans bitmap, the data on the master
 * file system, and the old data in memory and in the backing file, the
 * snapshot pseudo-driver can piece together the original file system
 * information to satisfy read requests.  If the requested chunk is not a
 * candidate, it returns a zeroed buffer.  If the chunk is a candidate but
 * has not been copied it reads it from the master file system.  If it is a
 * candidate and has been copied, it either copies the data from the
 * in-memory queue or it reads it in from the backing file.  The result is
 * a replication of the original file system that can be backed up, mounted,
 * or manipulated by other file system utilities that work on a read-only
 * device.
 *
 * This module is divided into three roughly logical sections:
 *
 *     - The snapshot driver, which is a character/block driver
 *       representing the snapshot itself.  These routines are
 *       prefixed with "snap_".
 *
 *     - The library routines that are defined in fssnap_if.h that
 *       are used by file systems that use this snapshot implementation.
 *       These functions are prefixed with "fssnap_" and are called through
 *       a function vector from the file system.
 *
 *     - The helper routines used by the snapshot driver and the fssnap
 *       library routines for managing the translation table and other
 *       useful functions.  These routines are all static and are
 *       prefixed with either "fssnap_" or "transtbl_" if they
 *       are specifically used for translation table activities.
 */

/* devinfo node of the driver; set by snap_attach(), cleared by snap_detach() */
static dev_info_t               *fssnap_dip = NULL;
/* head of the snapshot_id list (linked by sid_next); freed by snap_detach() */
static struct snapshot_id       *snapshot = NULL;
/* "anonymous" snapshot_id that tracks opens of the control device only */
static struct snapshot_id       snap_ctl;
/* snap_detach() refuses to detach while this is non-zero */
static int                      num_snapshots = 0;
/* taken by snap_detach() and snap_close() around num_snapshots/list work */
static kmutex_t                 snapshot_mutex;
/* module name handed to kstat_create() in _init() */
static char                     snapname[] = SNAP_NAME;

/*
 * "tunable" parameters; the defaults come from the FSSNAP_* constants.
 * NOTE(review): the consumers are outside this part of the file -- the
 * names suggest task queue sizing and the in-memory chunk limit; confirm.
 */
static int              fssnap_taskq_nthreads = FSSNAP_TASKQ_THREADS;
static uint_t           fssnap_max_mem_chunks = FSSNAP_MAX_MEM_CHUNKS;
static int              fssnap_taskq_maxtasks = FSSNAP_TASKQ_MAXTASKS;

/* static function prototypes */

/* snapshot driver (entry points wired into snap_cb_ops / snap_ops below) */
static int snap_getinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static int snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
static int snap_open(dev_t *devp, int flag, int otyp, cred_t *cred);
static int snap_close(dev_t dev, int flag, int otyp, cred_t *cred);
static int snap_strategy(struct buf *bp);
static int snap_read(dev_t dev, struct uio *uiop, cred_t *credp);
static int snap_print(dev_t dev, char *str);
static int snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode,
    cred_t *credp, int *rvalp);
static int snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
    int flags, char *name, caddr_t valuep, int *lengthp);
/* helper for snap_strategy(): fetches one chunk-bounded piece of a read */
static int snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk,
    int offset, int len, char *buffer);


/*
 * fssnap interface implementations (see fssnap_if.h).  _init() installs
 * these into the snapops vector.
 */
static void fssnap_strategy_impl(void *, struct buf *);
static void *fssnap_create_impl(chunknumber_t, uint_t, u_offset_t,
    struct vnode *, int, struct vnode **, char *, u_offset_t);
static void fssnap_set_candidate_impl(void *, chunknumber_t);
static int fssnap_is_candidate_impl(void *, u_offset_t);
static int fssnap_create_done_impl(void *);
static int fssnap_delete_impl(void *);

/* fssnap interface support routines */
static int  fssnap_translate(struct snapshot_id **, struct buf *);
static void fssnap_write_taskq(void *);
static void fssnap_create_kstats(snapshot_id_t *, int, const char *,
    const char *);
static int  fssnap_update_kstat_num(kstat_t *, int);
static void fssnap_delete_kstats(struct cow_info *);

/* translation table prototypes */
static cow_map_node_t *transtbl_add(cow_map_t *, chunknumber_t, caddr_t);
static cow_map_node_t *transtbl_get(cow_map_t *, chunknumber_t);
static void transtbl_delete(cow_map_t *, cow_map_node_t *);
static void transtbl_free(cow_map_t *);

/*
 * Module-wide highwater kstat: created in _init() and deleted in _fini().
 * It is NULL if kstat_create() failed, which _init() tolerates.
 */
static kstat_t *fssnap_highwater_kstat;

/* ************************************************************************ */

/* Device and Module Structures */

/*
 * Character/block entry points for the snapshot device.  The initializer
 * is positional, so the order of the entries must not change.  The device
 * is read-only: there is a read routine but the write, dump, mapping and
 * async I/O slots are all nodev.
 */
static struct cb_ops snap_cb_ops = {
        snap_open,
        snap_close,
        snap_strategy,
        snap_print,
        nodev,          /* no snap_dump */
        snap_read,
        nodev,          /* no snap_write */
        snap_ioctl,
        nodev,          /* no snap_devmap */
        nodev,          /* no snap_mmap   */
        nodev,          /* no snap_segmap */
        nochpoll,
        snap_prop_op,
        NULL,           /* streamtab */
        D_64BIT | D_NEW | D_MP, /* driver compatibility */
        CB_REV,
        nodev,          /* async I/O read entry point */
        nodev           /* async I/O write entry point */
};

/*
 * Device operations for the snapshot driver.  Positional initializer:
 * getinfo, attach and detach are implemented in this file, the remaining
 * slots are stubs, and the cb_ops pointer refers to snap_cb_ops above.
 */
static struct dev_ops snap_ops = {
        DEVO_REV,
        0,                      /* ref count */
        snap_getinfo,
        nulldev,                /* snap_identify obsolete */
        nulldev,                /* no snap_probe */
        snap_attach,
        snap_detach,
        nodev,                  /* no snap_reset */
        &snap_cb_ops,
        (struct bus_ops *)NULL,
        nulldev,                /* no snap_power() */
        ddi_quiesce_not_needed,         /* quiesce */
};

extern struct mod_ops mod_driverops;

/* driver linkage: ties the module to snap_ops */
static struct modldrv md = {
        &mod_driverops, /* Type of module. This is a driver */
        "snapshot driver",      /* Name of the module */
        &snap_ops,
};

/* module linkage passed to mod_install(), mod_info() and mod_remove() */
static struct modlinkage ml = {
        MODREV_1,
        &md,
        NULL
};

/*
 * Soft state anchor.  Each item is sized as a single pointer to a
 * struct snapshot_id (see ddi_soft_state_init() in _init()) and is
 * looked up by minor number in the driver entry points.
 */
static void *statep;

/*
 * _init() - module load entry point
 *
 *    Sets up all module-wide state (soft state anchor, snapshot_mutex, the
 *    highwater kstat and the snapops vector used by file systems) and only
 *    then installs the module.  The driver entry points may be called as
 *    soon as mod_install() succeeds, so nothing they depend on may be
 *    initialized afterwards.  If mod_install() fails, everything set up
 *    here is undone again.
 *
 *    Returns 0 on success, or the error from ddi_soft_state_init() or
 *    mod_install().  Failure to create the highwater kstat is only warned
 *    about; fssnap_highwater_kstat is left NULL in that case.
 */
int
_init(void)
{
        int             error;
        kstat_t         *ksp;
        kstat_named_t   *ksdata;

        error = ddi_soft_state_init(&statep, sizeof (struct snapshot_id *), 1);
        if (error) {
                cmn_err(CE_WARN, "_init: failed to init ddi_soft_state.");
                return (error);
        }

        mutex_init(&snapshot_mutex, NULL, MUTEX_DEFAULT, NULL);

        /*
         * Initialize the fssnap highwater kstat
         */
        ksp = kstat_create(snapname, 0, FSSNAP_KSTAT_HIGHWATER, "misc",
            KSTAT_TYPE_NAMED, 1, 0);
        if (ksp != NULL) {
                ksdata = (kstat_named_t *)ksp->ks_data;
                kstat_named_init(ksdata, FSSNAP_KSTAT_HIGHWATER,
                    KSTAT_DATA_UINT32);
                ksdata->value.ui32 = 0;
                kstat_install(ksp);
        } else {
                cmn_err(CE_WARN, "_init: failed to create highwater kstat.");
        }
        fssnap_highwater_kstat = ksp;

        /*
         * Fill in the snapshot operations vector for file systems
         * (defined in fssnap_if.c)
         */
        snapops.fssnap_create = fssnap_create_impl;
        snapops.fssnap_set_candidate = fssnap_set_candidate_impl;
        snapops.fssnap_is_candidate = fssnap_is_candidate_impl;
        snapops.fssnap_create_done = fssnap_create_done_impl;
        snapops.fssnap_delete = fssnap_delete_impl;
        snapops.fssnap_strategy = fssnap_strategy_impl;

        error = mod_install(&ml);
        if (error) {
                cmn_err(CE_WARN, "_init: failed to mod_install.");

                /* undo everything above, in reverse order */
                snapops.fssnap_create = NULL;
                snapops.fssnap_set_candidate = NULL;
                snapops.fssnap_is_candidate = NULL;
                snapops.fssnap_create_done = NULL;
                snapops.fssnap_delete = NULL;
                snapops.fssnap_strategy = NULL;

                if (fssnap_highwater_kstat != NULL) {
                        kstat_delete(fssnap_highwater_kstat);
                        fssnap_highwater_kstat = NULL;
                }

                mutex_destroy(&snapshot_mutex);
                ddi_soft_state_fini(&statep);
                return (error);
        }

        return (0);
}

/*
 * _info() - module information entry point
 *
 *    Hands the request to mod_info() together with this module's linkage
 *    and returns whatever mod_info() returns.
 */
int
_info(struct modinfo *modinfop)
{
        int     rv;

        rv = mod_info(&ml, modinfop);

        return (rv);
}

/*
 * _fini() - module unload entry point
 *
 *    Removes the module and, only if that succeeds, releases the
 *    module-wide state created by _init().  Returns 0 on success or the
 *    error from mod_remove(), in which case nothing is torn down.
 */
int
_fini(void)
{
        int     error;

        error = mod_remove(&ml);
        if (error)
                return (error);
        ddi_soft_state_fini(&statep);

        /*
         * delete the fssnap highwater kstat.  _init() tolerates a failed
         * kstat_create(), so the pointer may legitimately be NULL.
         */
        if (fssnap_highwater_kstat != NULL) {
                kstat_delete(fssnap_highwater_kstat);
                fssnap_highwater_kstat = NULL;
        }

        mutex_destroy(&snapshot_mutex);

        /*
         * Clear out the file system operations vector.  Every entry that
         * _init() filled in must be reset, otherwise a pointer into the
         * unloaded module text is left behind.
         */
        snapops.fssnap_create = NULL;
        snapops.fssnap_set_candidate = NULL;
        snapops.fssnap_is_candidate = NULL;
        snapops.fssnap_create_done = NULL;
        snapops.fssnap_delete = NULL;
        snapops.fssnap_strategy = NULL;

        return (0);
}

/* ************************************************************************ */

/*
 * Snapshot Driver Routines
 *
 * This section implements the snapshot character and block drivers.  The
 * device will appear to be a consistent read-only file system to
 * applications that wish to back it up or mount it.  The snapshot driver
 * communicates with the file system through the translation table, which
 * tells the snapshot driver where to find the data necessary to piece
 * together the frozen file system.  The data may either be on the master
 * device (no translation exists), in memory (a translation exists but has
 * not been flushed to the backing store), or in the backing store file.
 * The read request may require the snapshot driver to retrieve data from
 * several different places and piece it together to look like a single
 * contiguous read.
 *
 * The device minor number corresponds to the snapshot number in the list of
 * snapshot identifiers.  The soft state for each minor number is simply a
 * pointer to the snapshot id, which holds all of the snapshot state.  One
 * minor number is designated as the control device.  All snapshot create
 * and delete requests go through the control device to ensure this module
 * is properly loaded and attached before the file system starts calling
 * routines defined here.
 */


/*
 * snap_getinfo() - snapshot driver getinfo(9E) routine
 *
 */
/*ARGSUSED*/
static int
snap_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        /* dev_t to devinfo: there is only the one node saved at attach time */
        if (infocmd == DDI_INFO_DEVT2DEVINFO) {
                *result = fssnap_dip;
                return (DDI_SUCCESS);
        }

        /* dev_t to instance: we only have one instance, number 0 */
        if (infocmd == DDI_INFO_DEVT2INSTANCE) {
                *result = 0;
                return (DDI_SUCCESS);
        }

        /* any other request is not supported */
        return (DDI_FAILURE);
}

/*
 * snap_attach() - snapshot driver attach(9E) routine
 *
 *    sets up snapshot control device and control state.  The control state
 *    is a pointer to an "anonymous" snapshot_id for tracking opens and closes
 */
static int
snap_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        int     rv;

        /* resume requests need no work from this driver */
        if (cmd == DDI_PM_RESUME || cmd == DDI_RESUME)
                return (DDI_SUCCESS);

        /* anything other than a plain attach is refused */
        if (cmd != DDI_ATTACH)
                return (DDI_FAILURE);

        /* create the control device */
        rv = ddi_create_priv_minor_node(dip, SNAP_CTL_NODE, S_IFCHR,
            SNAP_CTL_MINOR, DDI_PSEUDO, PRIVONLY_DEV,
            PRIV_SYS_CONFIG, PRIV_SYS_CONFIG, 0666);
        if (rv == DDI_FAILURE)
                return (DDI_FAILURE);

        /*
         * Set up the control snapshot_id under its own rwlock.  It only
         * tracks opens of the control device and is never linked into
         * the snapshot list.
         */
        rw_init(&snap_ctl.sid_rwlock, NULL, RW_DEFAULT, NULL);
        rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
        fssnap_dip = dip;
        snap_ctl.sid_snapnumber = SNAP_CTL_MINOR;
        snap_ctl.sid_next = NULL;
        snap_ctl.sid_cowinfo = NULL;
        snap_ctl.sid_flags = 0;
        rw_exit(&snap_ctl.sid_rwlock);

        ddi_report_dev(dip);

        return (DDI_SUCCESS);
}

/*
 * snap_detach() - snapshot driver detach(9E) routine
 *
 *    destroys snapshot control device and control state.  If any snapshots
 *    are active (ie. num_snapshots != 0), the device will refuse to detach.
 */
static int
snap_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        struct snapshot_id *sidp, *following;

        /* only a plain detach is supported */
        if (cmd != DDI_DETACH)
                return (DDI_FAILURE);

        mutex_enter(&snapshot_mutex);

        /* refuse while a snapshot exists or the control device is open */
        if (num_snapshots != 0 ||
            (snap_ctl.sid_flags & SID_CHAR_BUSY) != 0) {
                mutex_exit(&snapshot_mutex);
                return (DDI_FAILURE);
        }

        /* walk the snapshot list and release every (idle) entry */
        sidp = snapshot;
        while (sidp != NULL) {
                ASSERT(SID_AVAILABLE(sidp) &&
                    !RW_LOCK_HELD(&sidp->sid_rwlock));
                following = sidp->sid_next;
                rw_destroy(&sidp->sid_rwlock);
                kmem_free(sidp, sizeof (struct snapshot_id));
                sidp = following;
        }
        snapshot = NULL;

        /* the control device node goes away with the instance */
        ddi_remove_minor_node(dip, SNAP_CTL_NODE);
        fssnap_dip = NULL;

        ASSERT((snap_ctl.sid_flags & SID_CHAR_BUSY) == 0);
        rw_destroy(&snap_ctl.sid_rwlock);
        mutex_exit(&snapshot_mutex);

        return (DDI_SUCCESS);
}

/*
 * snap_open() - snapshot driver open(9E) routine
 *
 *     marks the snapshot id as busy so it will not be recycled when deleted
 *     until the snapshot is closed.
 */
/* ARGSUSED */
static int
snap_open(dev_t *devp, int flag, int otyp, cred_t *cred)
{
        struct snapshot_id **sidpp, *sidp;
        minor_t minor;
        int     error = 0;

        /* snapshots are read-only */
        if (flag & FWRITE)
                return (EROFS);

        minor = getminor(*devp);

        if (minor == SNAP_CTL_MINOR) {
                /* control device must be opened exclusively */
                if (((flag & FEXCL) != FEXCL) || (otyp != OTYP_CHR))
                        return (EINVAL);

                /* only one open of the control device at a time */
                rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
                if ((snap_ctl.sid_flags & SID_CHAR_BUSY) != 0)
                        error = EBUSY;
                else
                        snap_ctl.sid_flags |= SID_CHAR_BUSY;
                rw_exit(&snap_ctl.sid_rwlock);

                return (error);
        }

        /* the soft state item for a snapshot minor is a snapshot_id pointer */
        sidpp = ddi_get_soft_state(statep, minor);
        if (sidpp == NULL || *sidpp == NULL)
                return (ENXIO);
        sidp = *sidpp;
        rw_enter(&sidp->sid_rwlock, RW_WRITER);

        /* an exclusive open fails if the snapshot is already busy */
        if ((flag & FEXCL) && SID_BUSY(sidp)) {
                error = EAGAIN;
                goto out;
        }

        ASSERT(sidpp != NULL && sidp != NULL);
        /* check to see if this snapshot has been killed on us */
        if (SID_INACTIVE(sidp)) {
                cmn_err(CE_WARN, "snap_open: snapshot %d does not exist.",
                    minor);
                error = ENXIO;
                goto out;
        }

        /*
         * Record which flavor of the device was opened.  Once the busy
         * bit is set the snapshot id will not be recycled until close.
         */
        switch (otyp) {
        case OTYP_CHR:
                sidp->sid_flags |= SID_CHAR_BUSY;
                break;
        case OTYP_BLK:
                sidp->sid_flags |= SID_BLOCK_BUSY;
                break;
        default:
                error = EINVAL;
                break;
        }

out:
        rw_exit(&sidp->sid_rwlock);
        return (error);
}

/*
 * snap_close() - snapshot driver close(9E) routine
 *
 *    unsets the busy bits in the snapshot id.  If the snapshot has been
 *    deleted while the snapshot device was open, the close call will clean
 *    up the remaining state information.
 */
/* ARGSUSED */
static int
snap_close(dev_t dev, int flag, int otyp, cred_t *cred)
{
        struct snapshot_id      **sidpp, *sidp;
        minor_t                 minor = getminor(dev);
        char                    nodename[20];
        int                     error = 0;

        /* the control device only has its busy bit to drop */
        if (minor == SNAP_CTL_MINOR) {
                rw_enter(&snap_ctl.sid_rwlock, RW_WRITER);
                snap_ctl.sid_flags &= ~(SID_CHAR_BUSY);
                rw_exit(&snap_ctl.sid_rwlock);
                return (0);
        }

        sidpp = ddi_get_soft_state(statep, minor);
        if (sidpp == NULL || *sidpp == NULL) {
                cmn_err(CE_WARN, "snap_close: could not find state for "
                    "snapshot %d.", minor);
                return (ENXIO);
        }
        sidp = *sidpp;

        /* snapshot_mutex is taken first, then the per-snapshot rwlock */
        mutex_enter(&snapshot_mutex);
        rw_enter(&sidp->sid_rwlock, RW_WRITER);

        /* drop the busy bit that matches the way the device was opened */
        switch (otyp) {
        case OTYP_CHR:
                sidp->sid_flags &= ~(SID_CHAR_BUSY);
                break;
        case OTYP_BLK:
                sidp->sid_flags &= ~(SID_BLOCK_BUSY);
                break;
        default:
                error = EINVAL;
                goto out;
        }

        /*
         * If this was the last close on a snapshot that has already been
         * deleted, finish the cleanup that the delete left behind because
         * the device was still in use: remove both minor nodes ("N" and
         * "N,raw") and free the soft state item.
         */
        if (SID_AVAILABLE(sidp)) {
                ASSERT(fssnap_dip != NULL);

                (void) snprintf(nodename, sizeof (nodename), "%d",
                    sidp->sid_snapnumber);
                ddi_remove_minor_node(fssnap_dip, nodename);

                (void) snprintf(nodename, sizeof (nodename), "%d,raw",
                    sidp->sid_snapnumber);
                ddi_remove_minor_node(fssnap_dip, nodename);

                ddi_soft_state_free(statep, sidp->sid_snapnumber);
                num_snapshots--;
        }

out:
        mutex_exit(&snapshot_mutex);
        rw_exit(&sidp->sid_rwlock);

        return (error);
}

/*
 * snap_read() - snapshot driver read(9E) routine
 *
 *    reads data from the snapshot by calling snap_strategy() through physio()
 */
/* ARGSUSED */
static int
snap_read(dev_t dev, struct uio *uiop, cred_t *credp)
{
        minor_t                 minor = getminor(dev);
        struct snapshot_id      **sidpp;

        /* the minor must have snapshot state before any I/O is started */
        sidpp = ddi_get_soft_state(statep, minor);
        if (sidpp != NULL && *sidpp != NULL) {
                /* physio() drives the transfer through snap_strategy() */
                return (physio(snap_strategy, NULL, dev, B_READ, minphys,
                    uiop));
        }

        cmn_err(CE_WARN,
            "snap_read: could not find state for snapshot %d.", minor);
        return (ENXIO);
}

/*
 * snap_strategy() - snapshot driver strategy(9E) routine
 *
 *    cycles through each chunk in the requested buffer and calls
 *    snap_getchunk() on each chunk to retrieve it from the appropriate
 *    place.  Once all of the parts are put together the requested buffer
 *    is returned.  The snapshot driver is read-only, so a write is invalid.
 */
static int
snap_strategy(struct buf *bp)
{
        struct snapshot_id **sidpp, *sidp;
        minor_t         minor;
        chunknumber_t   chunk;
        int             off, len;
        u_longlong_t    reqptr;
        int             error = 0;
        size_t          chunksz;
        caddr_t         buf;

        /* snapshot device is read-only */
        if (bp->b_flags & B_WRITE) {
                bioerror(bp, EROFS);
                bp->b_resid = bp->b_bcount;
                biodone(bp);
                return (0);
        }

        minor = getminor(bp->b_edev);
        sidpp = ddi_get_soft_state(statep, minor);
        if (sidpp == NULL || *sidpp == NULL) {
                cmn_err(CE_WARN,
                    "snap_strategy: could not find state for snapshot %d.",
                    minor);
                bioerror(bp, ENXIO);
                bp->b_resid = bp->b_bcount;
                biodone(bp);
                return (0);
        }
        sidp = *sidpp;
        ASSERT(sidp);
        rw_enter(&sidp->sid_rwlock, RW_READER);

        if (SID_INACTIVE(sidp)) {
                bioerror(bp, ENXIO);
                bp->b_resid = bp->b_bcount;
                biodone(bp);
                rw_exit(&sidp->sid_rwlock);
                return (0);
        }

        if (bp->b_flags & (B_PAGEIO|B_PHYS))
                bp_mapin(bp);

        bp->b_resid = bp->b_bcount;
        ASSERT(bp->b_un.b_addr);
        buf = bp->b_un.b_addr;

        chunksz = sidp->sid_cowinfo->cow_map.cmap_chunksz;

        /* reqptr is the current DEV_BSIZE offset into the device */
        /* chunk is the chunk containing reqptr */
        /* len is the length of the request (in the current chunk) in bytes */
        /* off is the byte offset into the current chunk */
        reqptr = bp->b_lblkno;
        while (bp->b_resid > 0) {
                chunk = dbtocowchunk(&sidp->sid_cowinfo->cow_map, reqptr);
                off = (reqptr % (chunksz >> DEV_BSHIFT)) << DEV_BSHIFT;
                len = min(chunksz - off, bp->b_resid);
                ASSERT((off + len) <= chunksz);

                if ((error = snap_getchunk(sidp, chunk, off, len, buf)) != 0) {
                        /*
                         * EINVAL means the user tried to go out of range.
                         * Anything else means it's likely that we're
                         * confused.
                         */
                        if (error != EINVAL) {
                                cmn_err(CE_WARN, "snap_strategy: error "
                                    "calling snap_getchunk, chunk = %llu, "
                                    "offset = %d, len = %d, resid = %lu, "
                                    "error = %d.",
                                    chunk, off, len, bp->b_resid, error);
                        }
                        bioerror(bp, error);
                        biodone(bp);
                        rw_exit(&sidp->sid_rwlock);
                        return (0);
                }
                bp->b_resid -= len;
                reqptr += (len >> DEV_BSHIFT);
                buf += len;
        }

        ASSERT(bp->b_resid == 0);
        biodone(bp);

        rw_exit(&sidp->sid_rwlock);
        return (0);
}

/*
 * snap_getchunk() - helper function for snap_strategy()
 *
 *    gets the requested data from the appropriate place and fills in the
 *    buffer.  chunk is the chunk number of the request, offset is the
 *    offset into that chunk and must be less than the chunk size.  len is
 *    the length of the request starting at offset, and must not exceed a
 *    chunk boundary.  buffer is the address to copy the data to.  len
 *    bytes are copied into the buffer starting at the location specified.
 *
 *    A chunk is located according to the following algorithm:
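 *        - If the chunk is not a candidate for translation, it was not
 *          allocated when the snapshot was taken and a zeroed buffer is
 *          returned.
 *        - If the chunk is a candidate but does not have a translation,
 *          it is read straight from the master device.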
 *        - If the chunk does have a translation, then it is either on
 *          disk or in memory:
 *            o If it is in memory the requested data is simply copied out
 *              of the in-memory buffer.
 *            o If it is in the backing store, it is read from there.
 *
 *    This function does the real work of the snapshot driver.
 */
/*
 * Returns 0 on success, EINVAL if chunk lies beyond the bitmap, the error
 * from the master device read or from vn_rdwr(), or EIO if the backing
 * file returned fewer bytes than requested.  The caller must hold
 * sid_rwlock as reader.
 */
static int
snap_getchunk(struct snapshot_id *sidp, chunknumber_t chunk, int offset,
    int len, char *buffer)
{
        cow_map_t       *cmap = &sidp->sid_cowinfo->cow_map;
        cow_map_node_t  *cmn;
        struct buf      *snapbuf;
        int             error = 0;
        char            *newbuffer = NULL;
        int             newlen = 0;
        int             partial = 0;

        ASSERT(RW_READ_HELD(&sidp->sid_rwlock));
        ASSERT(offset + len <= cmap->cmap_chunksz);

        /*
         * Check if the chunk number is out of range and if so bail out
         */
        if (chunk >= (cmap->cmap_bmsize * NBBY)) {
                return (EINVAL);
        }

        /*
         * If the chunk is not a candidate for translation, then the chunk
         * was not allocated when the snapshot was taken.  Since it does
         * not contain data associated with this snapshot, just return a
         * zero buffer instead.
         */
        if (isclr(cmap->cmap_candidate, chunk)) {
                bzero(buffer, len);
                return (0);
        }

        /*
         * if the chunk is a candidate for translation but a
         * translation does not exist, then read through to the
         * original file system.  The rwlock is held until the read
         * completes if it hasn't been translated to make sure the
         * file system does not translate the block before we
         * access it. If it has already been translated we don't
         * need the lock, because the translation will never go away.
         */
        rw_enter(&cmap->cmap_rwlock, RW_READER);
        if (isclr(cmap->cmap_hastrans, chunk)) {
                snapbuf = getrbuf(KM_SLEEP);
                /*
                 * Reading into the buffer saves having to do a copy,
                 * but gets tricky if the request size is not a
                 * multiple of DEV_BSIZE.  However, we are filling the
                 * buffer left to right, so future reads will write
                 * over any extra data we might have read.
                 */

                partial = len % DEV_BSIZE;

                snapbuf->b_bcount = len;
                snapbuf->b_lblkno = lbtodb(chunk * cmap->cmap_chunksz + offset);
                snapbuf->b_un.b_addr = buffer;

                snapbuf->b_iodone = NULL;
                snapbuf->b_proc = NULL;         /* i.e. the kernel */
                snapbuf->b_flags = B_READ | B_BUSY;
                /* the read goes to the device under the master file system */
                snapbuf->b_edev = sidp->sid_fvp->v_vfsp->vfs_dev;

                if (partial) {
                        /*
                         * Partial block read in progress.
                         * This is bad as modules further down the line
                         * assume buf's are exact multiples of DEV_BSIZE
                         * and we end up with fewer, or zero, bytes read.
                         * To get round this we need to round up to the
                         * nearest full block read and then return only
                         * len bytes.
                         */
                        newlen = (len - partial) + DEV_BSIZE;
                        newbuffer = kmem_alloc(newlen, KM_SLEEP);

                        snapbuf->b_bcount = newlen;
                        snapbuf->b_un.b_addr = newbuffer;
                }

                (void) bdev_strategy(snapbuf);
                (void) biowait(snapbuf);

                error = geterror(snapbuf);

                if (partial) {
                        /*
                         * Partial block read. Now we need to bcopy the
                         * correct number of bytes back into the
                         * supplied buffer, and tidy up our temp
                         * buffer.
                         */
                        bcopy(newbuffer, buffer, len);
                        kmem_free(newbuffer, newlen);
                }

                freerbuf(snapbuf);
                rw_exit(&cmap->cmap_rwlock);

                return (error);
        }

        /*
         * finally, if the chunk is a candidate for translation and it
         * has been translated, then we clone the chunk of the buffer
         * that was copied aside by the file system.
         * The cmap_rwlock does not need to be held after we know the
         * data has already been copied. Once a chunk has been copied
         * to the backing file, it is stable read only data.
         */
        cmn = transtbl_get(cmap, chunk);

        /* check whether the data is in memory or in the backing file */
        if (cmn != NULL) {
                ASSERT(cmn->cmn_buf);
                /* already in memory */
                bcopy(cmn->cmn_buf + offset, buffer, len);
                rw_exit(&cmap->cmap_rwlock);
        } else {
                ssize_t resid = len;
                int     bf_index;
                /*
                 * can cause deadlock with writer if we don't drop the
                 * cmap_rwlock before trying to get the backing store file
                 * vnode rwlock.
                 */
                rw_exit(&cmap->cmap_rwlock);

                /* each backing file holds cmap_chunksperbf chunks */
                bf_index = chunk / cmap->cmap_chunksperbf;

                /* read buffer from backing file */
                error = vn_rdwr(UIO_READ,
                    (sidp->sid_cowinfo->cow_backfile_array)[bf_index],
                    buffer, len, ((chunk % cmap->cmap_chunksperbf) *
                    cmap->cmap_chunksz) + offset, UIO_SYSSPACE, 0,
                    RLIM64_INFINITY, kcred, &resid);

                /*
                 * A short read leaves part of the caller's buffer
                 * unfilled.  Report it instead of returning success
                 * with incomplete snapshot data.
                 */
                if (error == 0 && resid != 0)
                        error = EIO;
        }

        return (error);
}

/*
 * snap_print() - snapshot driver print(9E) routine
 *
 *    Logs the message passed in `str' against the snapshot whose number
 *    is the minor number of `dev'.
 *
 *    Returns 0 once the message has been logged, or ENXIO (after logging
 *    a warning) when no soft state exists for that minor number.
 */
static int
snap_print(dev_t dev, char *str)
{
        minor_t         snapnum = getminor(dev);
        struct snapshot_id **statepp;

        statepp = ddi_get_soft_state(statep, snapnum);

        /* the common case: state exists, so emit the caller's message */
        if (statepp != NULL && *statepp != NULL) {
                cmn_err(CE_NOTE, "snap_print: snapshot %d: %s",  snapnum, str);
                return (0);
        }

        cmn_err(CE_WARN,
            "snap_print: could not find state for snapshot %d.", snapnum);

        return (ENXIO);
}

/*
 * snap_prop_op() - snapshot driver prop_op(9E) routine
 *
 *    Property requests against a snapshot device are forwarded to the
 *    device underlying the snapshotted file system (the "master" device)
 *    so that size (character driver) and nblocks (block driver) lookups
 *    are answered by it.
 *
 *    The request is handed straight to ddi_prop_op() when:
 *      - it targets the control device or uses the DDI_DEV_T_ANY wildcard,
 *      - the snapshot has no file system vnode or vfs attached, or
 *      - the master device could not be held or did not return
 *        DDI_PROP_SUCCESS.
 *
 *    Returns DDI_PROP_NOT_FOUND (after logging a warning) when there is
 *    no soft state for the minor number; otherwise whatever the servicing
 *    prop_op routine returned.
 */
static int
snap_prop_op(dev_t dev, dev_info_t *dip, ddi_prop_op_t prop_op,
    int flags, char *name, caddr_t valuep, int *lengthp)
{
        struct snapshot_id **sidpp;
        dev_info_t      *master_dip;
        dev_t           master_dev;
        int             minor;
        int             rc;

        minor = getminor(dev);

        /* control device and wildcard requests only see .conf/defaults */
        if ((dev == DDI_DEV_T_ANY) || (minor == SNAP_CTL_MINOR)) {
                return (ddi_prop_op(dev, dip, prop_op, flags, name,
                    valuep, lengthp));
        }

        /* locate the per-snapshot state to find the master device */
        sidpp = ddi_get_soft_state(statep, minor);
        if (sidpp == NULL || *sidpp == NULL) {
                cmn_err(CE_WARN,
                    "snap_prop_op: could not find state for "
                    "snapshot %d.", minor);
                return (DDI_PROP_NOT_FOUND);
        }

        /* no file system plumbed underneath: nothing to forward to */
        if (((*sidpp)->sid_fvp == NULL) ||
            ((*sidpp)->sid_fvp->v_vfsp == NULL)) {
                return (ddi_prop_op(dev, dip, prop_op, flags, name,
                    valuep, lengthp));
        }

        /* hold the master device for the duration of the forwarded call */
        master_dev = (*sidpp)->sid_fvp->v_vfsp->vfs_dev;
        master_dip = e_ddi_hold_devi_by_dev(master_dev, 0);
        if (master_dip != NULL) {
                rc = cdev_prop_op(master_dev, master_dip,
                    prop_op, flags, name, valuep, lengthp);
                ddi_release_devi(master_dip);
                if (rc == DDI_PROP_SUCCESS)
                        return (rc);
        }

        /* the master device did not answer; let the framework try */
        return (ddi_prop_op(dev, dip, prop_op, flags, name, valuep, lengthp));
}

/*
 * snap_ioctl() - snapshot driver ioctl(9E) routine
 *
 *    only applies to the control device.  The control device accepts
 *    requests to create a snapshot (_FIOSNAPSHOTCREATE and
 *    _FIOSNAPSHOTCREATE_MULTI) or to delete one (_FIOSNAPSHOTDELETE).  In
 *    every case the vnode for the requested file system is extracted, and
 *    the request is passed on to the file system via the same ioctl.  The
 *    file system is responsible for doing the things necessary for
 *    creating or destroying a snapshot, including any file system specific
 *    operations that must be performed as well as setting up and deleting
 *    the snapshot state through the fssnap interfaces.
 *
 *    Returns:
 *      EINVAL  - dev is not the control device, cmd is not recognized, or
 *                (delete only) the supplied descriptor does not identify
 *                a snapshot on the snapshot list
 *      EFAULT  - the ioctl argument could not be copied in
 *      EBADF   - the root file descriptor in the argument is not valid
 *      EBUSY   - (delete only) the snapshot device is mounted or is in
 *                the process of being mounted
 *      otherwise the value returned by the file system's VOP_IOCTL().
 */
static int
snap_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
        minor_t minor;
        int error = 0;

        minor = getminor(dev);

        if (minor != SNAP_CTL_MINOR) {
                return (EINVAL);
        }

        switch (cmd) {
        case _FIOSNAPSHOTCREATE:
        {
                struct fiosnapcreate    fc;
                struct file             *fp;
                struct vnode            *vp;

                if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
                        return (EFAULT);

                /* get vnode for file system mount point */
                if ((fp = getf(fc.rootfiledesc)) == NULL)
                        return (EBADF);

                /*
                 * take our own hold on the vnode so the file descriptor
                 * can be released before calling into the file system
                 */
                ASSERT(fp->f_vnode);
                vp = fp->f_vnode;
                VN_HOLD(vp);
                releasef(fc.rootfiledesc);

                /* pass ioctl request to file system */
                error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
                VN_RELE(vp);
                break;
        }
        case _FIOSNAPSHOTCREATE_MULTI:
        {
                struct fiosnapcreate_multi      fc;
                struct file             *fp;
                struct vnode            *vp;

                if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
                        return (EFAULT);

                /* get vnode for file system mount point */
                if ((fp = getf(fc.rootfiledesc)) == NULL)
                        return (EBADF);

                /*
                 * take our own hold on the vnode so the file descriptor
                 * can be released before calling into the file system
                 */
                ASSERT(fp->f_vnode);
                vp = fp->f_vnode;
                VN_HOLD(vp);
                releasef(fc.rootfiledesc);

                /* pass ioctl request to file system */
                error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
                VN_RELE(vp);
                break;
        }
        case _FIOSNAPSHOTDELETE:
        {
                major_t                 major;
                struct fiosnapdelete    fc;
                snapshot_id_t           *sidp = NULL;
                snapshot_id_t           *sidnextp = NULL;
                struct file             *fp = NULL;
                struct vnode            *vp = NULL;
                struct vfs              *vfsp = NULL;
                vfsops_t                *vfsops = EIO_vfsops;

                if (ddi_copyin((void *)arg, &fc, sizeof (fc), mode))
                        return (EFAULT);

                /* get vnode for file system mount point */
                if ((fp = getf(fc.rootfiledesc)) == NULL)
                        return (EBADF);

                ASSERT(fp->f_vnode);
                vp = fp->f_vnode;
                VN_HOLD(vp);
                releasef(fc.rootfiledesc);
                /*
                 * Test for two formats of delete and set correct minor/vp:
                 * pseudo device:
                 * fssnap -d [/dev/fssnap/x]
                 * or
                 * mount point:
                 * fssnap -d [/mntpt]
                 * Note that minor is verified to be equal to SNAP_CTL_MINOR
                 * at this point which is an invalid minor number.  It is
                 * only overwritten once a matching snapshot has been
                 * found, so that it can double as the "not found"
                 * indicator after the search.
                 */
                ASSERT(fssnap_dip != NULL);
                major = ddi_driver_major(fssnap_dip);
                mutex_enter(&snapshot_mutex);
                for (sidp = snapshot; sidp != NULL; sidp = sidnextp) {
                        rw_enter(&sidp->sid_rwlock, RW_READER);
                        sidnextp = sidp->sid_next;
                        /* pseudo device: */
                        if (major == getmajor(vp->v_rdev)) {
                                if (sidp->sid_snapnumber ==
                                    (uint_t)getminor(vp->v_rdev) &&
                                    sidp->sid_fvp) {
                                        minor = sidp->sid_snapnumber;
                                        /*
                                         * swap the hold on the snapshot
                                         * device vnode for a hold on the
                                         * file system vnode
                                         */
                                        VN_RELE(vp);
                                        vp = sidp->sid_fvp;
                                        VN_HOLD(vp);
                                        rw_exit(&sidp->sid_rwlock);
                                        break;
                                }
                        /* Mount point: */
                        } else {
                                if (sidp->sid_fvp == vp) {
                                        minor = sidp->sid_snapnumber;
                                        rw_exit(&sidp->sid_rwlock);
                                        break;
                                }
                        }
                        rw_exit(&sidp->sid_rwlock);
                }
                mutex_exit(&snapshot_mutex);
                /* no snapshot matched the supplied descriptor */
                if (minor == SNAP_CTL_MINOR) {
                        VN_RELE(vp);
                        return (EINVAL);
                }
                dev = makedevice(major, minor);
                /*
                 * Create dummy vfs entry
                 * to use as a locking semaphore across the IOCTL
                 * for mount in progress cases...
                 */
                vfsp = vfs_alloc(KM_SLEEP);
                VFS_INIT(vfsp, vfsops, NULL);
                VFS_HOLD(vfsp);
                vfs_addmip(dev, vfsp);
                if ((vfs_devmounting(dev, vfsp)) ||
                    (vfs_devismounted(dev))) {
                        vfs_delmip(vfsp);
                        VFS_RELE(vfsp);
                        VN_RELE(vp);
                        return (EBUSY);
                }
                /*
                 * Nobody mounted but do not release mount in progress lock
                 * until IOCTL complete to prohibit a mount sneaking
                 * in
                 */
                error = VOP_IOCTL(vp, cmd, arg, 0, credp, rvalp, NULL);
                vfs_delmip(vfsp);
                VFS_RELE(vfsp);
                VN_RELE(vp);
                break;
        }
        default:
                cmn_err(CE_WARN, "snap_ioctl: Invalid ioctl cmd %d, minor %d.",
                    cmd, minor);
                return (EINVAL);
        }

        return (error);
}


/* ************************************************************************ */

/*
 * Translation Table Routines
 *
 *    These support routines implement a simple doubly linked list
 *    to keep track of chunks that are currently in memory.  The maximum
 *    size of the list is determined by the fssnap_max_mem_chunks variable.
 *    The cmap_rwlock is used to protect the linkage of the list.
 */

/*
 * transtbl_add() - add a node to the translation table
 *
 *    Allocates a new node describing `chunk' and points it at `buf' (the
 *    buffer is referenced, not copied).  The node is pushed onto the front
 *    of the doubly linked list, so it becomes the new cmap_table head.
 *    The cmap_rwlock must be held as a writer through this operation.
 *
 *    Returns the newly inserted node.  Only the chunk, buffer and link
 *    fields are filled in here; the remaining fields are left for the
 *    caller to set.
 */
static cow_map_node_t *
transtbl_add(cow_map_t *cmap, chunknumber_t chunk, caddr_t buf)
{
        cow_map_node_t  *newhead;
        cow_map_node_t  *oldhead;

        ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));

        newhead = kmem_alloc(sizeof (cow_map_node_t), KM_SLEEP);
        oldhead = cmap->cmap_table;

        /* describe the translation */
        newhead->cmn_chunk = chunk;
        newhead->cmn_buf = buf;

        /* link in front of the current head (if there is one) */
        newhead->cmn_prev = NULL;
        newhead->cmn_next = oldhead;
        if (oldhead != NULL)
                oldhead->cmn_prev = newhead;

        /* new translations always become the first node */
        cmap->cmap_table = newhead;

        return (newhead);
}

/*
 * transtbl_get() - look up a node in the translation table
 *
 *    called by the snapshot driver to find data that has been translated
 *    but is still held in memory.  The list is searched linearly for a
 *    node whose chunk number equals `chunk'.  The cmap_rwlock must be
 *    held as a reader by the caller.
 *
 *    Returns the matching node, or NULL if the chunk is not in the table.
 */
static cow_map_node_t *
transtbl_get(cow_map_t *cmap, chunknumber_t chunk)
{
        cow_map_node_t *cmn;

        /* validate the map pointer before it is dereferenced below */
        ASSERT(cmap);
        ASSERT(RW_READ_HELD(&cmap->cmap_rwlock));

        /* search the translation table */
        for (cmn = cmap->cmap_table; cmn != NULL; cmn = cmn->cmn_next) {
                if (cmn->cmn_chunk == chunk)
                        return (cmn);
        }

        /* not found */
        return (NULL);
}

/*
 * transtbl_delete() - delete a node from the translation table
 *
 *    Unlinks `cmn' from the doubly linked list rooted at cmap_table and
 *    frees both the chunk buffer it references (cmap_chunksz bytes) and
 *    the node itself.  If the node is the head of the list, the head
 *    moves to the following node.  The cmap_rwlock must be held as a
 *    writer for this operation.
 */
static void
transtbl_delete(cow_map_t *cmap, cow_map_node_t *cmn)
{
        cow_map_node_t  *before;
        cow_map_node_t  *after;

        ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));
        ASSERT(cmn);
        ASSERT(cmap->cmap_table);

        before = cmn->cmn_prev;
        after = cmn->cmn_next;

        /* deleting the head: the successor becomes the new head */
        if (cmap->cmap_table == cmn) {
                ASSERT(cmn->cmn_prev == NULL);
                cmap->cmap_table = after;
        }

        /* splice the predecessor's forward link past this node */
        if (before != NULL) {
                ASSERT(cmn->cmn_prev->cmn_next == cmn);
                before->cmn_next = after;
        }

        /* splice the successor's backward link past this node */
        if (after != NULL) {
                ASSERT(cmn->cmn_next->cmn_prev == cmn);
                after->cmn_prev = before;
        }

        /* release the chunk data first, then the node that described it */
        ASSERT(cmn->cmn_buf);
        kmem_free(cmn->cmn_buf, cmap->cmap_chunksz);
        kmem_free(cmn, sizeof (cow_map_node_t));
}

/*
 * transtbl_free() - free the entire translation table
 *
 *    called when the snapshot is deleted.  Walks the list starting at
 *    cmap_table and frees every node together with the chunk buffer it
 *    references.  The bitmaps are not touched, and cmap_table itself is
 *    not reset by this routine.
 */
static void
transtbl_free(cow_map_t *cmap)
{
        cow_map_node_t  *node = cmap->cmap_table;

        while (node != NULL) {
                /* remember the successor before this node is freed */
                cow_map_node_t  *following = node->cmn_next;

                kmem_free(node->cmn_buf, cmap->cmap_chunksz);
                kmem_free(node, sizeof (cow_map_node_t));

                node = following;
        }
}


/* ************************************************************************ */

/*
 * Interface Implementation Routines
 *
 * The following functions implement snapshot interface routines that are
 * called by the file system to create, delete, and use a snapshot.  The
 * interfaces are defined in fssnap_if.c and are filled in by this driver
 * when it is loaded.  This technique allows the file system to depend on
 * the interface module without having to load the full implementation and
 * snapshot device drivers.
 */

/*
 * fssnap_strategy_impl() - strategy routine called by the file system
 *
 *    called by the file system to handle copy-on-write when necessary.  All
 *    reads and writes that the file system performs should go through this
 *    function.  If the file system calls the underlying device's strategy
 *    routine without going through fssnap_strategy() (eg. by calling
 *    bdev_strategy()), the snapshot may not be consistent.
 *
 *    `snapshot_id' is a POINTER TO A POINTER to the snapshot id (ie. the
 *    address of the handle held by the file system).  Keeping the address
 *    of the handle lets this routine notice when the snapshot has been
 *    deleted, or deleted and recreated, out from under it: the handle is
 *    re-read after the snapshot id has been locked and compared with the
 *    value sampled before.
 *
 *    Reads are passed straight to the underlying device.  Writes are
 *    handed to fssnap_translate(), which decides whether a copy-on-write
 *    is required; if the snapshot is gone, disabled or marked for deletion
 *    the write is passed straight through as well.  A translation error on
 *    a still-active snapshot is reported through bioerror()/biodone().
 */
static void
fssnap_strategy_impl(void *snapshot_id, buf_t *bp)
{
        struct snapshot_id **handlep;
        struct snapshot_id *snap;
        int rc;

        /* read requests are always passed through */
        if (bp->b_flags & B_READ) {
                (void) bdev_strategy(bp);
                return;
        }

        /*
         * The snapshot read lock could not be taken BEFORE the file system
         * checked for a snapshot, so anything may have happened since: the
         * snapshot may have been deleted, deleted and recreated, or
         * deleted for this file system with its number now in use by
         * another file system.  Sampling the file system's handle here and
         * comparing it again once locked catches most of that, provided
         * the file system keeps its snapshot_id pointer at a stable
         * address.
         */
        handlep = (struct snapshot_id **)snapshot_id;
        snap = *handlep;

        /* snapshot disabled for this file system: nothing to translate */
        if (snap == NULL) {
                (void) bdev_strategy(bp);
                return;
        }

        /*
         * With the reader lock held the snapshot will not go away, but it
         * may already have changed, so everything is re-checked below.
         */
        rw_enter(&snap->sid_rwlock, RW_READER);

        /*
         * SID_DELETE means an error was found somewhere and the snapshot
         * should be deleted; no new translations should occur.
         */
        if (snap->sid_flags & SID_DELETE) {
                rw_exit(&snap->sid_rwlock);
                (void) fssnap_delete_impl(handlep);
                (void) bdev_strategy(bp);
                return;
        }

        /*
         * If the file system no longer points at the snapshot we sampled,
         * this buffer must not be translated since it could end up in a
         * snapshot for a different file system.  Even with an unchanged
         * pointer the snapshot may have been disabled before the reader
         * lock was obtained.
         */
        if (snap != *handlep || SID_INACTIVE(snap)) {
                rw_exit(&snap->sid_rwlock);
                (void) bdev_strategy(bp);
                return;
        }

        /*
         * The snapshot cannot go away while the reader lock is held, and
         * we are reasonably certain it is the right one: translate.
         */
        rc = fssnap_translate(handlep, bp);
        if (rc == 0) {
                rw_exit(&snap->sid_rwlock);
                return;
        }

        /*
         * fssnap_translate can release the reader lock if it has to wait
         * for a semaphore, and the snapshot may have been deleted in that
         * window.  If so, just send the buf through to the file system's
         * device.
         */
        if (snap != *handlep || SID_INACTIVE(snap)) {
                rw_exit(&snap->sid_rwlock);
                (void) bdev_strategy(bp);
                return;
        }

        /* genuine failure on a live snapshot: fail the request */
        bioerror(bp, rc);
        biodone(bp);
        rw_exit(&snap->sid_rwlock);
}

/*
 * fssnap_translate() - helper function for fssnap_strategy()
 *
 *    performs the actual copy-on-write for write requests, if required.
 *    This function does the real work of the file system side of things.
 *
 *    It first checks the candidate bitmap to quickly determine whether any
 *    action is necessary.  If the candidate bitmap indicates the chunk was
 *    allocated when the snapshot was created, then it checks to see whether
 *    a translation already exists.  If a translation already exists then no
 *    action is required.  If the chunk is a candidate for copy-on-write,
 *    and a translation does not already exist, then the chunk is read in
 *    and a node is added to the translation table.
 *
 *    Once all of the chunks in the request range have been copied (if they
 *    needed to be), then the original request can be satisfied and the old
 *    data can be overwritten.
 *
 *    sidpp is the address of the file system's snapshot handle and wbp is
 *    the write request.  The caller must hold the snapshot's sid_rwlock as
 *    a reader.  The lock may be dropped and re-acquired while waiting on
 *    the throttle semaphore, but it is held again on every return path.
 *
 *    Returns 0 once wbp has been handed to bdev_strategy().  Returns ENXIO
 *    if the snapshot changed or became inactive while the lock was
 *    dropped, or the error from reading the old data; in both of those
 *    cases wbp has NOT been issued and the caller must dispose of it.
 */
static int
fssnap_translate(struct snapshot_id **sidpp, struct buf *wbp)
{
        snapshot_id_t   *sidp = *sidpp;
        struct buf      *oldbp; /* buffer to store old data in */
        struct cow_info *cowp = sidp->sid_cowinfo;
        cow_map_t       *cmap = &cowp->cow_map;
        cow_map_node_t  *cmn;
        chunknumber_t   cowchunk, startchunk, endchunk;
        int             error;
        int     throttle_write = 0;

        /* the caller must already hold the snapshot lock as a reader */
        ASSERT(RW_READ_HELD(&sidp->sid_rwlock));

        /* first and last chunk touched by this write (inclusive) */
        startchunk = dbtocowchunk(cmap, wbp->b_lblkno);
        endchunk   = dbtocowchunk(cmap, wbp->b_lblkno +
            ((wbp->b_bcount-1) >> DEV_BSHIFT));

        /*
         * Do not throttle the writes of the fssnap taskq thread and
         * the log roll (trans_roll) thread. Furthermore the writes to
         * the on-disk log are also not subject to throttling.
         * The fssnap_write_taskq thread's write can block on the throttling
         * semaphore which leads to self-deadlock as this same thread
         * releases the throttling semaphore after completing the IO.
         * If the trans_roll thread's write is throttled then we can deadlock
         * because the fssnap_taskq_thread which releases the throttling
         * semaphore can block waiting for log space which can only be
         * released by the trans_roll thread.
         */

        throttle_write = !(taskq_member(cowp->cow_taskq, curthread) ||
            tsd_get(bypass_snapshot_throttle_key));

        /*
         * Iterate through all chunks covered by this write and perform the
         * copy-aside if necessary.  Once all chunks have been safely
         * stowed away, the new data may be written in a single sweep.
         *
         * For each chunk in the range, the following sequence is performed:
         *      - Is the chunk a candidate for translation?
         *              o If not, then no translation is necessary, continue
         *      - If it is a candidate, then does it already have a translation?
         *              o If so, then no translation is necessary, continue
         *      - If it is a candidate, but does not yet have a translation,
         *        then read the old data and schedule an asynchronous taskq
         *        to write the old data to the backing file.
         *
         * Once this has been performed over the entire range of chunks, then
         * it is safe to overwrite the data that is there.
         *
         * Note that no lock is required to check the candidate bitmap because
         * it never changes once the snapshot is created.  The hastrans
         * bitmap is first checked without taking the cmap_rwlock.  If it
         * turns out a copy appears to be required, then the cmap_rwlock is
         * taken as a writer, and the bitmap is re-checked as a translation
         * may have been added in the meantime.  Finally, the write lock is
         * held while reading the old data to make sure it is not translated
         * out from under us.
         *
         * This locking mechanism should be sufficient to handle multiple
         * threads writing to overlapping chunks simultaneously.
         */
        for (cowchunk = startchunk; cowchunk <= endchunk; cowchunk++) {
                /*
                 * If the cowchunk is outside of the range of our
                 * candidate maps, then simply break out of the
                 * loop and pass the I/O through to bdev_strategy.
                 * This would occur if the file system has grown
                 * larger since the snapshot was taken.
                 */
                if (cowchunk >= (cmap->cmap_bmsize * NBBY))
                        break;

                /*
                 * If no disk blocks were allocated in this chunk when the
                 * snapshot was created then no copy-on-write will be
                 * required.  Since this bitmap is read-only no locks are
                 * necessary.
                 */
                if (isclr(cmap->cmap_candidate, cowchunk)) {
                        continue;
                }

                /*
                 * If a translation already exists, the data can be written
                 * through since the old data has already been saved off.
                 */
                if (isset(cmap->cmap_hastrans, cowchunk)) {
                        continue;
                }


                /*
                 * Throttle translations if there are too many outstanding
                 * chunks in memory.  The semaphore is sema_v'd by the taskq.
                 *
                 * You can't keep the sid_rwlock if you would go to sleep.
                 * This will result in deadlock when someone tries to delete
                 * the snapshot (wants the sid_rwlock as a writer, but can't
                 * get it).
                 */
                if (throttle_write) {
                        if (sema_tryp(&cmap->cmap_throttle_sem) == 0) {
                                rw_exit(&sidp->sid_rwlock);
                                atomic_inc_32(&cmap->cmap_waiters);
                                sema_p(&cmap->cmap_throttle_sem);
                                atomic_dec_32(&cmap->cmap_waiters);
                                rw_enter(&sidp->sid_rwlock, RW_READER);

                                /*
                                 * Since we released the sid_rwlock, the
                                 * state may have transitioned underneath
                                 * us, so check it again.  Give back the
                                 * semaphore we just obtained before
                                 * bailing out.
                                 */
                                if (sidp != *sidpp || SID_INACTIVE(sidp)) {
                                        sema_v(&cmap->cmap_throttle_sem);
                                        return (ENXIO);
                                }
                        }
                }

                /*
                 * Acquire the lock as a writer and check to see if a
                 * translation has been added in the meantime.  If so, the
                 * throttle slot taken above is not needed and is returned.
                 */
                rw_enter(&cmap->cmap_rwlock, RW_WRITER);
                if (isset(cmap->cmap_hastrans, cowchunk)) {
                        if (throttle_write)
                                sema_v(&cmap->cmap_throttle_sem);
                        rw_exit(&cmap->cmap_rwlock);
                        continue; /* go to the next chunk */
                }

                /*
                 * read a full chunk of data from the requested offset rounded
                 * down to the nearest chunk size.  The read is synchronous:
                 * biowait() blocks until it completes.
                 */
                oldbp = getrbuf(KM_SLEEP);
                oldbp->b_lblkno = cowchunktodb(cmap, cowchunk);
                oldbp->b_edev = wbp->b_edev;
                oldbp->b_bcount = cmap->cmap_chunksz;
                oldbp->b_bufsize = cmap->cmap_chunksz;
                oldbp->b_iodone = NULL;
                oldbp->b_proc = NULL;
                oldbp->b_flags = B_READ;
                oldbp->b_un.b_addr = kmem_alloc(cmap->cmap_chunksz, KM_SLEEP);

                (void) bdev_strategy(oldbp);
                (void) biowait(oldbp);

                /*
                 * It's ok to bail in the middle of translating the range
                 * because the extra copy-asides will not hurt anything
                 * (except by using extra space in the backing store).
                 */
                if ((error = geterror(oldbp)) != 0) {
                        cmn_err(CE_WARN, "fssnap_translate: error reading "
                            "old data for snapshot %d, chunk %llu, disk block "
                            "%lld, size %lu, error %d.", sidp->sid_snapnumber,
                            cowchunk, oldbp->b_lblkno, oldbp->b_bcount, error);
                        kmem_free(oldbp->b_un.b_addr, cmap->cmap_chunksz);
                        freerbuf(oldbp);
                        rw_exit(&cmap->cmap_rwlock);
                        if (throttle_write)
                                sema_v(&cmap->cmap_throttle_sem);
                        return (error);
                }

                /*
                 * add the node to the translation table and save a reference
                 * to pass to the taskq for writing out to the backing file.
                 * The data buffer now belongs to the node, so only the buf
                 * header is freed here.
                 */
                cmn = transtbl_add(cmap, cowchunk, oldbp->b_un.b_addr);
                freerbuf(oldbp);

                /*
                 * Add a reference to the snapshot id so the lower level
                 * processing (ie. the taskq) can get back to the state
                 * information.  release_sem records whether this
                 * translation holds a throttle slot that the taskq has to
                 * give back.
                 */
                cmn->cmn_sid = sidp;
                cmn->release_sem = throttle_write;
                setbit(cmap->cmap_hastrans, cowchunk);

                rw_exit(&cmap->cmap_rwlock);

                /*
                 * schedule the asynchronous write to the backing file.
                 * When there is no backing file array nothing is
                 * dispatched and the translation simply stays in memory.
                 */
                if (cowp->cow_backfile_array != NULL)
                        (void) taskq_dispatch(cowp->cow_taskq,
                            fssnap_write_taskq, cmn, TQ_SLEEP);
        }

        /*
         * Write new data in place of the old data.  At this point all of the
         * chunks touched by this write have been copied aside and so the new
         * data can be written out all at once.
         */
        (void) bdev_strategy(wbp);

        return (0);
}

/*
 * fssnap_write_taskq() - write in-memory translations to the backing file
 *
 *    writes in-memory translations to the backing file asynchronously.  A
 *    task is dispatched each time a new translation is created.  The task
 *    writes the data to the backing file and removes it from the memory
 *    list. The throttling semaphore is released only if the particular
 *    translation was throttled in fssnap_translate.
 */
static void
fssnap_write_taskq(void *arg)
{
        cow_map_node_t  *cmn = (cow_map_node_t *)arg;
        snapshot_id_t   *sidp = cmn->cmn_sid;
        cow_info_t      *cowp = sidp->sid_cowinfo;
        cow_map_t       *cmap = &cowp->cow_map;
        int             error;
        int             bf_index;
        int             release_sem = cmn->release_sem;

        /*
         * The sid_rwlock does not need to be held here because the taskqs
         * are destroyed explicitly by fssnap_delete (with the sid_rwlock
         * held as a writer).  taskq_destroy() will flush all of the tasks
         * out before fssnap_delete frees up all of the structures.
         */

        /* if the snapshot was disabled from under us, drop the request. */
        rw_enter(&sidp->sid_rwlock, RW_READER);
        if (SID_INACTIVE(sidp)) {
                rw_exit(&sidp->sid_rwlock);
                if (release_sem)
                        sema_v(&cmap->cmap_throttle_sem);
                return;
        }
        rw_exit(&sidp->sid_rwlock);

        atomic_inc_64((uint64_t *)&cmap->cmap_nchunks);

        if ((cmap->cmap_maxsize != 0) &&
            ((cmap->cmap_nchunks * cmap->cmap_chunksz) > cmap->cmap_maxsize)) {
                cmn_err(CE_WARN, "fssnap_write_taskq: snapshot %d (%s) has "
                    "reached the maximum backing file size specified (%llu "
                    "bytes) and will be deleted.", sidp->sid_snapnumber,
                    (char *)cowp->cow_kstat_mntpt->ks_data,
                    cmap->cmap_maxsize);
                if (release_sem)
                        sema_v(&cmap->cmap_throttle_sem);
                atomic_or_uint(&sidp->sid_flags, SID_DELETE);
                return;
        }

        /* perform the write */
        bf_index = cmn->cmn_chunk / cmap->cmap_chunksperbf;

        if (error = vn_rdwr(UIO_WRITE, (cowp->cow_backfile_array)[bf_index],
            cmn->cmn_buf, cmap->cmap_chunksz,
            (cmn->cmn_chunk % cmap->cmap_chunksperbf) * cmap->cmap_chunksz,
            UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, (ssize_t *)NULL)) {
                cmn_err(CE_WARN, "fssnap_write_taskq: error writing to "
                    "backing file.  DELETING SNAPSHOT %d, backing file path "
                    "%s, offset %llu bytes, error %d.", sidp->sid_snapnumber,
                    (char *)cowp->cow_kstat_bfname->ks_data,
                    cmn->cmn_chunk * cmap->cmap_chunksz, error);
                if (release_sem)
                        sema_v(&cmap->cmap_throttle_sem);
                atomic_or_uint(&sidp->sid_flags, SID_DELETE);
                return;
        }

        /*
         * now remove the node and buffer from memory
         */
        rw_enter(&cmap->cmap_rwlock, RW_WRITER);
        transtbl_delete(cmap, cmn);
        rw_exit(&cmap->cmap_rwlock);

        /* Allow more translations */
        if (release_sem)
                sema_v(&cmap->cmap_throttle_sem);

}

/*
 * fssnap_create_impl() - called from the file system to create a new snapshot
 *
 *    allocates and initializes the structures needed for a new snapshot.
 *    This is called by the file system when it receives an ioctl request to
 *    create a new snapshot.  An unused snapshot identifier is either found
 *    or created, and eventually returned as the opaque handle the file
 *    system will use to identify this snapshot.  The snapshot number
 *    associated with the snapshot identifier is the same as the minor
 *    number for the snapshot device that is used to access that snapshot.
 *
 *    The snapshot can not be used until the candidate bitmap is populated
 *    by the file system (see fssnap_set_candidate_impl()), and the file
 *    system finishes the setup process by calling fssnap_create_done().
 *    Nearly all of the snapshot locks are held for the duration of the
 *    create, and are not released until fssnap_create_done() is called.
 */
static void *
fssnap_create_impl(chunknumber_t nchunks, uint_t chunksz, u_offset_t maxsize,
    struct vnode *fsvp, int backfilecount, struct vnode **bfvpp, char *backpath,
    u_offset_t max_backfile_size)
{
        refstr_t *mountpoint;
        char taskqname[50];
        struct cow_info *cowp;
        struct cow_map  *cmap;
        struct snapshot_id *sidp;
        int lastsnap;

        /*
         * Sanity check the parameters we care about
         * (we don't care about the informational parameters)
         */
        if ((nchunks == 0) ||
            ((chunksz % DEV_BSIZE) != 0) ||
            (bfvpp == NULL)) {
                return (NULL);
        }

        /*
         * Look for unused snapshot identifiers.  Snapshot ids are never
         * freed, but deleted snapshot ids will be recycled as needed.
         */
        mutex_enter(&snapshot_mutex);

findagain:
        lastsnap = 0;
        for (sidp = snapshot; sidp != NULL; sidp = sidp->sid_next) {
                if (sidp->sid_snapnumber > lastsnap)
                        lastsnap = sidp->sid_snapnumber;

                /*
                 * The sid_rwlock is taken as a reader initially so that
                 * activity on each snapshot is not stalled while searching
                 * for a free snapshot id.
                 */
                rw_enter(&sidp->sid_rwlock, RW_READER);

                /*
                 * If the snapshot has been deleted and nobody is using the
                 * snapshot device than we can reuse this snapshot_id.  If
                 * the snapshot is marked to be deleted (SID_DELETE), then
                 * it hasn't been deleted yet so don't reuse it.
                 */
                if (SID_AVAILABLE(sidp))
                        break; /* This spot is unused, so take it */
                rw_exit(&sidp->sid_rwlock);
        }

        /*
         * add a new snapshot identifier if there are no deleted
         * entries.  Since it doesn't matter what order the entries
         * are in we can just add it to the beginning of the list.
         */
        if (sidp) {
                if (rw_tryupgrade(&sidp->sid_rwlock) == 0) {
                        /* someone else grabbed it as a writer, try again */
                        rw_exit(&sidp->sid_rwlock);
                        goto findagain;
                }
        } else {
                /* Create a new node if we didn't find an unused one */
                sidp = kmem_alloc(sizeof (struct snapshot_id), KM_SLEEP);
                rw_init(&sidp->sid_rwlock, NULL, RW_DEFAULT, NULL);
                rw_enter(&sidp->sid_rwlock, RW_WRITER);
                sidp->sid_snapnumber = (snapshot == NULL) ? 0 : lastsnap + 1;
                sidp->sid_cowinfo = NULL;
                sidp->sid_flags = 0;
                sidp->sid_next = snapshot;
                snapshot = sidp;
        }

        ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
        ASSERT(sidp->sid_cowinfo == NULL);
        ASSERT(sidp->sid_snapnumber <= (lastsnap + 1));

        sidp->sid_flags |= SID_CREATING;
        /* The root vnode is held until snap_delete_impl() is called */
        VN_HOLD(fsvp);
        sidp->sid_fvp = fsvp;
        num_snapshots++;

        /* allocate and initialize structures */

        cowp = kmem_zalloc(sizeof (struct cow_info), KM_SLEEP);

        cowp->cow_backfile_array = bfvpp;
        cowp->cow_backcount = backfilecount;
        cowp->cow_backfile_sz = max_backfile_size;

        /*
         * Initialize task queues for this snapshot.  Only a small number
         * of threads are required because they will be serialized on the
         * backing file's reader/writer lock anyway.
         */
        (void) snprintf(taskqname, sizeof (taskqname), "%s_taskq_%d", snapname,
            sidp->sid_snapnumber);
        cowp->cow_taskq = taskq_create(taskqname, fssnap_taskq_nthreads,
            minclsyspri, 1,  fssnap_taskq_maxtasks, 0);

        /* don't allow tasks to start until after everything is ready */
        taskq_suspend(cowp->cow_taskq);

        /* initialize translation table */
        cmap = &cowp->cow_map;
        rw_init(&cmap->cmap_rwlock, NULL, RW_DEFAULT, NULL);
        rw_enter(&cmap->cmap_rwlock, RW_WRITER);

        sema_init(&cmap->cmap_throttle_sem, fssnap_max_mem_chunks, NULL,
            SEMA_DEFAULT, NULL);

        cmap->cmap_chunksz = chunksz;
        cmap->cmap_maxsize = maxsize;
        cmap->cmap_chunksperbf = max_backfile_size / chunksz;

        /*
         * allocate one bit per chunk for the bitmaps, round up
         */
        cmap->cmap_bmsize = (nchunks + (NBBY - 1)) / NBBY;
        cmap->cmap_hastrans  = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);
        cmap->cmap_candidate = kmem_zalloc(cmap->cmap_bmsize, KM_SLEEP);

        sidp->sid_cowinfo = cowp;

        /* initialize kstats for this snapshot */
        mountpoint = vfs_getmntpoint(fsvp->v_vfsp);
        fssnap_create_kstats(sidp, sidp->sid_snapnumber,
            refstr_value(mountpoint), backpath);
        refstr_rele(mountpoint);

        mutex_exit(&snapshot_mutex);

        /*
         * return with snapshot id rwlock held as a writer until
         * fssnap_create_done is called
         */
        return (sidp);
}

/*
 * fssnap_set_candidate_impl() - flag one chunk as a copy-on-write candidate
 *
 *    Turns on the bit for `chunknumber' in the snapshot's candidate bitmap.
 *    The file system is expected to do this for every chunk whose contents
 *    must be preserved (allocated data and metadata); chunks left clear
 *    are not treated as candidates.
 *
 *    snapshot_id is the opaque handle for the snapshot; chunknumber must
 *    lie within the bitmap (checked by ASSERT only).
 */
static void
fssnap_set_candidate_impl(void *snapshot_id, chunknumber_t chunknumber)
{
        struct snapshot_id *sidp = snapshot_id;
        struct cow_map *map = &sidp->sid_cowinfo->cow_map;

        /* the bitmap holds cmap_bmsize bytes, NBBY bits apiece */
        ASSERT(chunknumber < (map->cmap_bmsize * NBBY));

        setbit(map->cmap_candidate, chunknumber);
}

/*
 * fssnap_is_candidate_impl() - check whether a chunk is a candidate
 *
 *    returns 0 if the chunk is not a candidate and 1 if the chunk is a
 *    candidate.  This can be used by the file system to change behavior for
 *    chunks that might induce a copy-on-write.  The offset is specified in
 *    bytes since the chunk size may not be known by the file system.
 *
 *    snapshot_id is the opaque handle for the snapshot; off is a byte
 *    offset that is converted to a chunk number using the snapshot's chunk
 *    size.  The resulting chunk must lie within the candidate bitmap
 *    (checked by ASSERT only).
 */
static int
fssnap_is_candidate_impl(void *snapshot_id, u_offset_t off)
{
        struct snapshot_id      *sid = snapshot_id;
        struct cow_info *cowp = sid->sid_cowinfo;
        struct cow_map  *cmap = &cowp->cow_map;
        ulong_t chunknumber = off / cmap->cmap_chunksz;

        /* simple bitmap operation for now */
        ASSERT(chunknumber < (cmap->cmap_bmsize * NBBY));

        /*
         * Collapse the bit test to exactly 0 or 1, as documented above,
         * in case isset() evaluates to the masked bit itself.
         */
        return (isset(cmap->cmap_candidate, chunknumber) ? 1 : 0);
}

/*
 * fssnap_create_done_impl() - finish setting up a snapshot
 *
 *    Called by the file system once the candidate bitmap is populated.
 *    The caller must still hold, as writer, both the snapshot id lock and
 *    the translation map lock taken by fssnap_create_impl().  This routine
 *    clears the CREATING/DISABLED flags, allocates the per-snapshot soft
 *    state, creates the block and raw (character) minor nodes, drops both
 *    locks and resumes the task queue.
 *
 *    Returns the snapshot number on success, or -1 if the soft state or
 *    either minor node could not be created.  The locks are released and
 *    the task queue resumed in either case; nothing created before the
 *    failing step is undone here.
 */
static int
fssnap_create_done_impl(void *snapshot_id)
{
        struct snapshot_id      *sidp = snapshot_id;
        struct snapshot_id      **slot;
        struct cow_info         *cowp;
        struct cow_map          *cmap;
        char                    nodename[20];
        int                     minor;
        int                     result;

        /* both writer locks are inherited from fssnap_create_impl() */
        ASSERT(sidp);
        ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));
        ASSERT(sidp->sid_cowinfo);

        cowp = sidp->sid_cowinfo;
        cmap = &cowp->cow_map;

        ASSERT(RW_WRITE_HELD(&cmap->cmap_rwlock));

        sidp->sid_flags &= ~(SID_CREATING | SID_DISABLED);

        /* the minor number doubles as the value returned on success */
        minor = sidp->sid_snapnumber;
        result = minor;

        if (ddi_soft_state_zalloc(statep, minor) != DDI_SUCCESS) {
                cmn_err(CE_WARN,
                    "snap_ioctl: create: could not allocate "
                    "state for snapshot %d.", minor);
                result = -1;
        } else {
                /* the soft state is a single pointer back to the sid */
                slot = ddi_get_soft_state(statep, minor);
                *slot = sidp;

                ASSERT(fssnap_dip != NULL);

                /* block node is named "<n>" */
                (void) snprintf(nodename, sizeof (nodename), "%d", minor);
                if (ddi_create_minor_node(fssnap_dip, nodename, S_IFBLK,
                    minor, DDI_PSEUDO, 0) != DDI_SUCCESS) {
                        cmn_err(CE_WARN, "snap_ioctl: could not create "
                            "block minor node for snapshot %d.", minor);
                        result = -1;
                } else {
                        /* character node is named "<n>,raw" */
                        (void) snprintf(nodename, sizeof (nodename),
                            "%d,raw", minor);
                        if (ddi_create_minor_node(fssnap_dip, nodename,
                            S_IFCHR, minor, DDI_PSEUDO, 0) != DDI_SUCCESS) {
                                cmn_err(CE_WARN, "snap_ioctl: could not create "
                                    "character minor node for snapshot %d.",
                                    minor);
                                result = -1;
                        }
                }
        }

        /* success or failure, the creation locks are dropped here */
        rw_exit(&sidp->sid_rwlock);
        rw_exit(&cmap->cmap_rwlock);

        /* let the taskq threads start processing */
        taskq_resume(cowp->cow_taskq);

        return (result);
}

/*
 * fssnap_delete_impl() - delete a snapshot
 *
 *    used when a snapshot is no longer needed.  This is called by the file
 *    system when it receives an ioctl request to delete a snapshot.  It is
 *    also called internally when error conditions such as disk full, errors
 *    writing to the backing file, or backing file maxsize exceeded occur.
 *    If the snapshot device is busy when the delete request is received,
 *    all state will be deleted except for the soft state and device files
 *    associated with the snapshot; they will be deleted when the snapshot
 *    device is closed.
 *
 *    NOTE this function takes a POINTER TO A POINTER to the snapshot id,
 *    and expects to be able to set the handle held by the file system to
 *    NULL.  This depends on the file system checking that variable for NULL
 *    before calling fssnap_strategy().
 */
static int
fssnap_delete_impl(void *snapshot_id)
{
        struct snapshot_id      **sidpp = (struct snapshot_id **)snapshot_id;
        struct snapshot_id      *sidp;
        struct snapshot_id      **statesidpp;
        struct cow_info         *cowp;
        struct cow_map          *cmap;
        char                    name[20];
        int                     snapnumber = -1;
        vnode_t                 **vpp;

        /*
         * sidp is guaranteed to be valid if sidpp is valid because
         * the snapshot list is append-only.
         */
        if (sidpp == NULL) {
                return (-1);
        }

        sidp = *sidpp;
        rw_enter(&sidp->sid_rwlock, RW_WRITER);

        ASSERT(RW_WRITE_HELD(&sidp->sid_rwlock));

        /*
         * double check that the snapshot is still valid for THIS file system
         */
        if (*sidpp == NULL) {
                rw_exit(&sidp->sid_rwlock);
                return (-1);
        }

        /*
         * Now we know the snapshot is still valid and will not go away
         * because we have the write lock.  Once the state is transitioned
         * to "disabling", the sid_rwlock can be released.  Any pending I/O
         * waiting for the lock as a reader will check for this state and
         * abort without touching data that may be getting freed.
         */
        sidp->sid_flags |= SID_DISABLING;
        if (sidp->sid_flags & SID_DELETE) {
                cmn_err(CE_WARN, "Snapshot %d automatically deleted.",
                    sidp->sid_snapnumber);
                sidp->sid_flags &= ~(SID_DELETE);
        }


        /*
         * This is pointing into file system specific data!  The assumption is
         * that fssnap_strategy() gets called from the file system based on
         * whether this reference to the snapshot_id is NULL or not.  So
         * setting this to NULL should disable snapshots for the file system.
         */
        *sidpp = NULL;

        /* remove cowinfo */
        cowp = sidp->sid_cowinfo;
        if (cowp == NULL) {
                rw_exit(&sidp->sid_rwlock);
                return (-1);
        }
        rw_exit(&sidp->sid_rwlock);

        /* destroy task queues first so they don't reference freed data. */
        if (cowp->cow_taskq) {
                taskq_destroy(cowp->cow_taskq);
                cowp->cow_taskq = NULL;
        }

        if (cowp->cow_backfile_array != NULL) {
                for (vpp = cowp->cow_backfile_array; *vpp; vpp++)
                        VN_RELE(*vpp);
                kmem_free(cowp->cow_backfile_array,
                    (cowp->cow_backcount + 1) * sizeof (vnode_t *));
                cowp->cow_backfile_array = NULL;
        }

        sidp->sid_cowinfo = NULL;

        /* remove cmap */
        cmap = &cowp->cow_map;
        ASSERT(cmap);

        if (cmap->cmap_candidate)
                kmem_free(cmap->cmap_candidate, cmap->cmap_bmsize);

        if (cmap->cmap_hastrans)
                kmem_free(cmap->cmap_hastrans, cmap->cmap_bmsize);

        if (cmap->cmap_table)
                transtbl_free(&cowp->cow_map);

        rw_destroy(&cmap->cmap_rwlock);

        while (cmap->cmap_waiters) {
                sema_p(&cmap->cmap_throttle_sem);
                sema_v(&cmap->cmap_throttle_sem);
        }
        sema_destroy(&cmap->cmap_throttle_sem);

        /* remove kstats */
        fssnap_delete_kstats(cowp);

        kmem_free(cowp, sizeof (struct cow_info));

        statesidpp = ddi_get_soft_state(statep, sidp->sid_snapnumber);
        if (statesidpp == NULL || *statesidpp == NULL) {
                cmn_err(CE_WARN,
                    "fssnap_delete_impl: could not find state for snapshot %d.",
                    sidp->sid_snapnumber);
        }
        ASSERT(*statesidpp == sidp);

        /*
         * Leave the node in the list marked DISABLED so it can be reused
         * and avoid many race conditions.  Return the snapshot number
         * that was deleted.
         */
        mutex_enter(&snapshot_mutex);
        rw_enter(&sidp->sid_rwlock, RW_WRITER);
        sidp->sid_flags &= ~(SID_DISABLING);
        sidp->sid_flags |= SID_DISABLED;
        VN_RELE(sidp->sid_fvp);
        sidp->sid_fvp = NULL;
        snapnumber = sidp->sid_snapnumber;

        /*
         * If the snapshot is not busy, free the device info now.  Otherwise
         * the device nodes are freed in snap_close() when the device is
         * closed.  The sid will not be reused until the device is not busy.
         */
        if (SID_AVAILABLE(sidp)) {
                /* remove the device nodes */
                ASSERT(fssnap_dip != NULL);
                (void) snprintf(name, sizeof (name), "%d",
                    sidp->sid_snapnumber);
                ddi_remove_minor_node(fssnap_dip, name);
                (void) snprintf(name, sizeof (name), "%d,raw",
                    sidp->sid_snapnumber);
                ddi_remove_minor_node(fssnap_dip, name);

                /* delete the state structure */
                ddi_soft_state_free(statep, sidp->sid_snapnumber);
                num_snapshots--;
        }

        mutex_exit(&snapshot_mutex);
        rw_exit(&sidp->sid_rwlock);

        return (snapnumber);
}

/*
 * fssnap_create_kstats() - allocate and initialize snapshot kstats
 *
 *    Creates up to three kstats for snapshot instance `snapnum' and stores
 *    them in the snapshot's cow_info: a raw kstat holding the mount point
 *    string, a raw kstat holding the backing file name, and a named kstat
 *    holding the numeric values (state, backing file size, max size,
 *    creation time, chunk size).  Any kstat of the same name left over from
 *    an earlier use of this snapshot number is deleted first.  The global
 *    high water mark kstat is raised to `snapnum' if it is lower.
 *
 *    sidp         - snapshot id; sid_cowinfo must already be set.
 *    snapnum      - kstat instance number.
 *    mountpoint   - mount point string, or NULL (a warning is logged).
 *    backfilename - backing file path, or NULL (silently skipped).
 *
 *    Failures are reported with cmn_err() and leave the corresponding
 *    cow_kstat_* pointer NULL.  If the high water mark kstat is missing,
 *    the function returns without creating any kstat.
 */
static void
fssnap_create_kstats(snapshot_id_t *sidp, int snapnum,
    const char *mountpoint, const char *backfilename)
{
        kstat_t *num, *mntpoint, *bfname;
        kstat_named_t *hw;
        struct cow_info *cowp = sidp->sid_cowinfo;
        struct cow_kstat_num *stats;
        size_t len;

        /* update the high water mark */
        if (fssnap_highwater_kstat == NULL) {
                cmn_err(CE_WARN, "fssnap_create_kstats: failed to lookup "
                    "high water mark kstat.");
                return;
        }

        hw = (kstat_named_t *)fssnap_highwater_kstat->ks_data;
        if (hw->value.ui32 < snapnum)
                hw->value.ui32 = snapnum;

        /* initialize the mount point kstat */
        kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_MNTPT);

        if (mountpoint != NULL) {
                /* data area holds the string plus its terminator */
                len = strlen(mountpoint) + 1;
                mntpoint = kstat_create(snapname, snapnum, FSSNAP_KSTAT_MNTPT,
                    "misc", KSTAT_TYPE_RAW, len, 0);
                if (mntpoint == NULL) {
                        cowp->cow_kstat_mntpt = NULL;
                        cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
                            "create mount point kstat");
                } else {
                        /*
                         * Copy the terminator too rather than depending on
                         * the kstat data area having been zero-filled.
                         */
                        (void) strncpy(mntpoint->ks_data, mountpoint, len);
                        cowp->cow_kstat_mntpt = mntpoint;
                        kstat_install(mntpoint);
                }
        } else {
                cowp->cow_kstat_mntpt = NULL;
                cmn_err(CE_WARN, "fssnap_create_kstats: mount point not "
                    "specified.");
        }

        /* initialize the backing file kstat */
        kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_BFNAME);

        if (backfilename == NULL) {
                cowp->cow_kstat_bfname = NULL;
        } else {
                /* data area holds the string plus its terminator */
                len = strlen(backfilename) + 1;
                bfname = kstat_create(snapname, snapnum, FSSNAP_KSTAT_BFNAME,
                    "misc", KSTAT_TYPE_RAW, len, 0);
                if (bfname != NULL) {
                        /*
                         * This data is later printed as a C string, so make
                         * the termination explicit.
                         */
                        (void) strncpy(bfname->ks_data, backfilename, len);
                        cowp->cow_kstat_bfname = bfname;
                        kstat_install(bfname);
                } else {
                        cowp->cow_kstat_bfname = NULL;
                        cmn_err(CE_WARN, "fssnap_create_kstats: failed to "
                            "create backing file name kstat");
                }
        }

        /* initialize numeric kstats */
        kstat_delete_byname(snapname, snapnum, FSSNAP_KSTAT_NUM);

        /* one named entry per kstat_named_t member of cow_kstat_num */
        num = kstat_create(snapname, snapnum, FSSNAP_KSTAT_NUM,
            "misc", KSTAT_TYPE_NAMED,
            sizeof (struct cow_kstat_num) / sizeof (kstat_named_t),
            0);
        if (num == NULL) {
                cmn_err(CE_WARN, "fssnap_create_kstats: failed to create "
                    "numeric kstats");
                cowp->cow_kstat_num = NULL;
                return;
        }

        cowp->cow_kstat_num = num;
        stats = num->ks_data;
        /* state and backing file size are refreshed by the update hook */
        num->ks_update = fssnap_update_kstat_num;
        num->ks_private = sidp;

        kstat_named_init(&stats->ckn_state, FSSNAP_KSTAT_NUM_STATE,
            KSTAT_DATA_INT32);
        kstat_named_init(&stats->ckn_bfsize, FSSNAP_KSTAT_NUM_BFSIZE,
            KSTAT_DATA_UINT64);
        kstat_named_init(&stats->ckn_maxsize, FSSNAP_KSTAT_NUM_MAXSIZE,
            KSTAT_DATA_UINT64);
        kstat_named_init(&stats->ckn_createtime, FSSNAP_KSTAT_NUM_CREATETIME,
            KSTAT_DATA_LONG);
        kstat_named_init(&stats->ckn_chunksize, FSSNAP_KSTAT_NUM_CHUNKSIZE,
            KSTAT_DATA_UINT32);

        /* initialize the static kstats */
        stats->ckn_chunksize.value.ui32 = cowp->cow_map.cmap_chunksz;
        stats->ckn_maxsize.value.ui64 = cowp->cow_map.cmap_maxsize;
        stats->ckn_createtime.value.l = gethrestime_sec();

        kstat_install(num);
}

/*
 * fssnap_update_kstat_num() - update a numerical snapshot kstat value
 *
 *    ks_update callback for the per-snapshot numeric kstat.  Refreshes the
 *    two dynamic values: the snapshot state, derived from the snapshot id
 *    flags, and the backing file size, computed as chunk count times chunk
 *    size.
 *
 *    ksp - the numeric kstat; ks_private is the owning snapshot id.
 *    rw  - KSTAT_WRITE is refused; any other value is treated as a read.
 *
 *    Returns 0 on success, or EACCES for a write request.
 */
int
fssnap_update_kstat_num(kstat_t *ksp, int rw)
{
        snapshot_id_t *sidp = (snapshot_id_t *)ksp->ks_private;
        struct cow_info *cowp = sidp->sid_cowinfo;
        struct cow_kstat_num *stats = ksp->ks_data;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        /* state */
        if (sidp->sid_flags & SID_CREATING)
                stats->ckn_state.value.i32 = COWSTATE_CREATING;
        else if (SID_INACTIVE(sidp))
                stats->ckn_state.value.i32 = COWSTATE_DISABLED;
        else if (SID_BUSY(sidp))
                stats->ckn_state.value.i32 = COWSTATE_ACTIVE;
        else
                stats->ckn_state.value.i32 = COWSTATE_IDLE;

        /*
         * bfsize
         *
         * fssnap_delete_impl() clears sid_cowinfo before it removes this
         * kstat, so a read during teardown can find no cow_info here.  In
         * that case keep the last size reported instead of dereferencing
         * NULL.  One operand is widened first so the product is computed
         * in 64 bits whatever the width of the two fields.
         */
        if (cowp != NULL) {
                stats->ckn_bfsize.value.ui64 =
                    (uint64_t)cowp->cow_map.cmap_nchunks *
                    cowp->cow_map.cmap_chunksz;
        }

        return (0);
}

/*
 * fssnap_delete_kstats() - deallocate snapshot kstats
 *
 */
void
fssnap_delete_kstats(struct cow_info *cowp)
{
        if (cowp->cow_kstat_num != NULL) {
                kstat_delete(cowp->cow_kstat_num);
                cowp->cow_kstat_num = NULL;
        }
        if (cowp->cow_kstat_mntpt != NULL) {
                kstat_delete(cowp->cow_kstat_mntpt);
                cowp->cow_kstat_mntpt = NULL;
        }
        if (cowp->cow_kstat_bfname != NULL) {
                kstat_delete(cowp->cow_kstat_bfname);
                cowp->cow_kstat_bfname = NULL;
        }
}