root/usr/src/uts/common/os/devcache.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/note.h>
#include <sys/t_lock.h>
#include <sys/cmn_err.h>
#include <sys/instance.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/ddi.h>
#include <sys/hwconf.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/ddi_impldefs.h>
#include <sys/ndi_impldefs.h>
#include <sys/modctl.h>
#include <sys/dacf.h>
#include <sys/promif.h>
#include <sys/cpuvar.h>
#include <sys/pathname.h>
#include <sys/kobj.h>
#include <sys/devcache.h>
#include <sys/devcache_impl.h>
#include <sys/sysmacros.h>
#include <sys/varargs.h>
#include <sys/callb.h>

/*
 * This facility provides interfaces to clients to register,
 * read and update cache data in persisted backing store files,
 * usually in /etc/devices.  The data persisted through this
 * mechanism should be stateless data, functioning in the sense
 * of a cache.  Writes are performed by a background daemon
 * thread, permitting a client to schedule an update without
 * blocking, then continue updating the data state in
 * parallel.  The data is only locked by the daemon thread
 * to pack the data in preparation for the write.
 *
 * Data persisted through this mechanism should be capable
 * of being regenerated through normal system operation,
 * for example attaching all disk devices would cause all
 * devids to be registered for those devices.  By caching
 * a devid-device tuple, the system can operate in a
 * more optimal way, directly attaching the device mapped
 * to a devid, rather than burdensomely driving attach of
 * the entire device tree to discover a single device.
 *
 * Note that a client should only need to include
 * <sys/devcache.h> for the supported interfaces.
 *
 * The data per client is entirely within the control of
 * the client.  When reading, data unpacked from the backing
 * store should be inserted in the list.  The pointer to
 * the list can be retrieved via nvf_list().  When writing,
 * the data on the list is to be packed and returned to the
 * nvpdaemon as an nvlist.
 *
 * Obvious restrictions are imposed by the limits of the
 * nvlist format.  The data cannot be read or written
 * piecemeal, and large amounts of data aren't recommended.
 * However, nvlists do allow that data be named and typed
 * and can be size-of-int invariant, and the cached data
 * can be versioned conveniently.
 *
 * The registration involves two steps: a handle is
 * allocated by calling the registration function.
 * This sets up the data referenced by the handle and
 * initializes the lock.  Following registration, the
 * client must initialize the data list.  The list
 * interfaces require that the list element with offset
 * to the node link be provided.  The format of the
 * list element is under the control of the client.
 *
 * Locking: the address of the data list r/w lock provided
 * can be accessed with nvf_lock().  The lock must be held
 * as reader when traversing the list or checking state,
 * such as nvf_is_dirty().  The lock must be held as
 * writer when updating the list or marking it dirty.
 * The lock must not be held when waking the daemon.
 *
 * The data r/w lock is held as writer when the pack,
 * unpack and free list handlers are called.  The
 * lock should not be dropped and must be still held
 * upon return.  The client should also hold the lock
 * as reader when checking if the list is dirty, and
 * as writer when marking the list dirty or initiating
 * a read.
 *
 * The asynchronous nature of updates allows for the
 * possibility that the data may continue to be updated
 * once the daemon has been notified that an update is
 * desired.  The data only needs to be locked against
 * updates when packing the data into the form to be
 * written.  When the write of the packed data has
 * completed, the daemon will automatically reschedule
 * an update if the data was marked dirty after the
 * point at which it was packed.  Before beginning an
 * update, the daemon attempts to lock the data as
 * writer; if the writer lock is already held, it
 * backs off and retries later.  The model is to give
 * priority to the kernel processes generating the
 * data, and that the nature of the data is that
 * it does not change often, can be re-generated when
 * needed, so updates should not happen often and
 * can be delayed until the data stops changing.
 * The client may update the list or mark it dirty
 * any time it is able to acquire the lock as
 * writer first.
 *
 * A failed write will be retried after some delay,
 * in the hope that the cause of the error will be
 * transient, for example a filesystem with no space
 * available.  An update on a read-only filesystem
 * is failed silently and not retried; this would be
 * the case when booted off install media.
 *
 * There is no unregister mechanism as of yet, as it
 * hasn't been needed so far.
 */

/*
 * Global list of files registered and updated by the nvpflush
 * daemon, protected by the nvf_cache_mutex.  While an
 * update is taking place, a file is temporarily moved to
 * the dirty list to avoid locking the primary list for
 * the duration of the update.
 */
list_t          nvf_cache_files;
list_t          nvf_dirty_files;
kmutex_t        nvf_cache_mutex;


/*
 * Allow some delay from an update of the data before flushing
 * to permit simultaneous updates of multiple changes.
 * Changes in the data are expected to be bursty, i.e.
 * reconfig or hot-plug of a new adapter.
 *
 * kfio_report_error (default 0)
 *      Set to 1 to enable some error messages related to low-level
 *      kernel file i/o operations.
 *
 * nvpflush_delay (default 10)
 *      The number of seconds after data is marked dirty before the
 *      flush daemon is triggered to flush the data.  A longer period
 *      of time permits more data updates per write.  Note that
 *      every update resets the timer so no repository write will
 *      occur while data is being updated continuously.
 *
 * nvpdaemon_idle_time (default 60)
 *      The number of seconds the daemon will sleep idle before exiting.
 *
 */
/* Default values, in seconds, for the tunables of the same name below. */
#define NVPFLUSH_DELAY          10
#define NVPDAEMON_IDLE_TIME     60

/* One second (1000000 microseconds) expressed in clock ticks. */
#define TICKS_PER_SECOND        (drv_usectohz(1000000))

/*
 * Tunables
 */
int kfio_report_error = 0;              /* nonzero: nvf_error() emits CE_NOTEs */
int kfio_disable_read = 0;              /* disable reads (not the retire store) */
int kfio_disable_write = 0;             /* disable all writes */

int nvpflush_delay      = NVPFLUSH_DELAY;       /* seconds */
int nvpdaemon_idle_time = NVPDAEMON_IDLE_TIME;  /* seconds */

/*
 * Flush timer and daemon thread bookkeeping.
 */
static timeout_id_t     nvpflush_id = 0;        /* id returned by timeout() */
static int              nvpflush_timer_busy = 0; /* flush timeout is armed */
static int              nvpflush_daemon_active = 0; /* daemon thread exists */
static kthread_t        *nvpflush_thr_id = 0;   /* from thread_create() */

/*
 * State shared between the timeout handler and the daemon.
 */
static int              do_nvpflush = 0;        /* daemon should flush now */
static int              nvpbusy = 0;            /* set when a flush pass starts */
static kmutex_t         nvpflush_lock;          /* also the daemon's CPR lock */
static kcondvar_t       nvpflush_cv;            /* daemon waits here for work */
static kthread_id_t     nvpflush_thread;        /* curthread of the daemon */
static clock_t          nvpticks;               /* lbolt at which flush is due */

static void nvpflush_daemon(void);

#ifdef  DEBUG
/* Debug-build switches; presumably consumed by the NVPDAEMON_DEBUG/KFDEBUG macros. */
int nvpdaemon_debug = 0;
int kfio_debug = 0;
#endif  /* DEBUG */

/* Defined outside this file. */
extern int modrootloaded;
extern void mdi_read_devices_files(void);
extern void mdi_clean_vhcache(void);
extern int sys_shutdown;

/*
 * Initialize the overall cache file management
 */
void
i_ddi_devices_init(void)
{
        list_create(&nvf_cache_files, sizeof (nvfd_t),
            offsetof(nvfd_t, nvf_link));
        list_create(&nvf_dirty_files, sizeof (nvfd_t),
            offsetof(nvfd_t, nvf_link));
        mutex_init(&nvf_cache_mutex, NULL, MUTEX_DEFAULT, NULL);
        retire_store_init();
        devid_cache_init();
}

/*
 * Read cache files.
 * Only files that may be needed in order to mount root
 * should be read from here.
 */
void
i_ddi_read_devices_files(void)
{
        /*
         * The retire store goes first since it may need to offline
         * devices.  It deliberately ignores kfio_disable_read; see
         * the tunable ddi_retire_store_bypass and the comments in:
         *      uts/common/os/retire_store.c
         */
        retire_store_read();

        if (kfio_disable_read)
                return;

        mdi_read_devices_files();
        devid_cache_read();
}

/*
 * Initialize the flush daemon's lock and condition variable, then
 * wake the daemon if any registered cache file is already dirty.
 */
void
i_ddi_start_flush_daemon(void)
{
        nvfd_t  *fd;

        ASSERT(i_ddi_io_initialized());

        mutex_init(&nvpflush_lock, NULL, MUTEX_DRIVER, NULL);
        cv_init(&nvpflush_cv, NULL, CV_DRIVER, NULL);

        mutex_enter(&nvf_cache_mutex);

        /* Stop at the first dirty file; one wakeup covers them all. */
        fd = list_head(&nvf_cache_files);
        while (fd != NULL && !NVF_IS_DIRTY(fd))
                fd = list_next(&nvf_cache_files, fd);
        if (fd != NULL)
                nvf_wake_daemon();

        mutex_exit(&nvf_cache_mutex);
}

/*
 * Run the cleanup routines for the devid cache and for the mdi
 * vhcache, in that order.  Both routines are defined outside this
 * file.
 */
void
i_ddi_clean_devices_files(void)
{
        devid_cache_cleanup();
        mdi_clean_vhcache();
}

/*
 * Register a cache file with the nvpflush daemon.  A zeroed
 * descriptor is allocated, bound to the caller's ops vector and
 * appended to the global list of cache files.  The returned handle
 * is what every other nvf_* interface operates on.
 * No unregister mechanism exists for now.
 */
nvf_handle_t
nvf_register_file(nvf_ops_t *ops)
{
        nvfd_t *fd = kmem_zalloc(sizeof (nvfd_t), KM_SLEEP);

        rw_init(&fd->nvf_lock, NULL, RW_DRIVER, NULL);
        fd->nvf_flags = 0;
        fd->nvf_ops = ops;

        mutex_enter(&nvf_cache_mutex);
        list_insert_tail(&nvf_cache_files, fd);
        mutex_exit(&nvf_cache_mutex);

        return ((nvf_handle_t)fd);
}

/*
 * printf-style error reporting for the cache file code.  The message
 * is issued as a CE_NOTE, and only when the kfio_report_error tunable
 * is nonzero; otherwise the call does nothing.
 */
/*PRINTFLIKE1*/
void
nvf_error(const char *fmt, ...)
{
        va_list args;

        if (!kfio_report_error)
                return;

        va_start(args, fmt);
        vcmn_err(CE_NOTE, fmt, args);
        va_end(args);
}

/*
 * Accessors clients may use to manage the data that is
 * persisted in a cache file.
 */

/*
 * Return the path of the cache file behind this handle.
 */
char *
nvf_cache_name(nvf_handle_t handle)
{
        nvfd_t *fd = (nvfd_t *)handle;

        return (fd->nvf_cache_path);
}

/*
 * Return the address of the r/w lock protecting this handle's data.
 */
krwlock_t *
nvf_lock(nvf_handle_t handle)
{
        nvfd_t *fd = (nvfd_t *)handle;

        return (&fd->nvf_lock);
}

/*
 * Return the address of the client data list kept in this handle.
 */
list_t *
nvf_list(nvf_handle_t handle)
{
        nvfd_t *fd = (nvfd_t *)handle;

        return (&fd->nvf_data_list);
}

/*
 * Flag the handle's data as dirty.  The caller must hold the
 * handle's lock as writer.
 */
void
nvf_mark_dirty(nvf_handle_t handle)
{
        nvfd_t *fd = (nvfd_t *)handle;

        ASSERT(RW_WRITE_HELD(&fd->nvf_lock));
        NVF_MARK_DIRTY(fd);
}

/*
 * Report whether the handle's data is flagged dirty (nonzero if so).
 * The caller must hold the handle's lock, as reader or writer.
 */
int
nvf_is_dirty(nvf_handle_t handle)
{
        nvfd_t *fd = (nvfd_t *)handle;

        ASSERT(RW_LOCK_HELD(&fd->nvf_lock));
        return (NVF_IS_DIRTY(fd));
}

/*
 * Compute a 16-bit XOR checksum over buflen bytes at buf.
 *
 * The buffer is summed as native 16-bit words; when buflen is odd the
 * trailing byte is folded in as the initial checksum value.  The
 * result is stored in, and compared against, the on-disk header, so
 * the algorithm must not change.
 *
 * buf is accessed through a uint16_t pointer; the callers in this
 * file pass a header struct or kmem-allocated buffers.
 *
 * A zero or negative length yields a checksum of 0.  Without this
 * guard an odd negative length would index memory ahead of buf.
 */
static uint16_t
nvp_cksum(uchar_t *buf, int64_t buflen)
{
        uint16_t *p = (uint16_t *)buf;
        uint16_t cksum = 0;
        int64_t n;

        if (buflen <= 0)
                return (0);

        if ((buflen & 0x01) != 0) {
                /* odd length: seed the sum with the last byte */
                buflen--;
                cksum = buf[buflen];
        }
        for (n = buflen / 2; n > 0; n--)
                cksum ^= *p++;
        return (cksum);
}

/*
 * Read the packed nvlist stored in 'filename' and hand it back,
 * unpacked, through *ret_nvlist.
 *
 * The file consists of an nvpf_hdr_t followed by exactly nvpf_size
 * bytes of packed nvlist data.  The header magic, version and
 * checksum, the data size and the data checksum are all verified
 * before the data is unpacked.
 *
 * Returns 0 on success.  On failure *ret_nvlist is NULL and the
 * return value is one of:
 *      ENOENT - the file could not be opened
 *      EIO    - error reading the header
 *      EINVAL - header or data is short, oversized or corrupted
 */
int
fread_nvlist(char *filename, nvlist_t **ret_nvlist)
{
        struct _buf     *file;
        nvpf_hdr_t      hdr;
        char            *buf;
        nvlist_t        *nvl;
        int             rval;
        uint_t          offset;
        int             n;
        char            c;
        uint16_t        cksum, hdrsum;

        *ret_nvlist = NULL;

        file = kobj_open_file(filename);
        if (file == (struct _buf *)-1) {
                KFDEBUG((CE_CONT, "cannot open file: %s\n", filename));
                return (ENOENT);
        }

        offset = 0;
        n = kobj_read_file(file, (char *)&hdr, sizeof (hdr), offset);
        if (n != sizeof (hdr)) {
                kobj_close_file(file);
                if (n < 0) {
                        nvf_error("error reading header: %s\n", filename);
                        return (EIO);
                } else if (n == 0) {
                        KFDEBUG((CE_CONT, "file empty: %s\n", filename));
                } else {
                        nvf_error("header size incorrect: %s\n", filename);
                }
                return (EINVAL);
        }
        offset += n;

        KFDEBUG2((CE_CONT, "nvpf_magic: 0x%x\n", hdr.nvpf_magic));
        KFDEBUG2((CE_CONT, "nvpf_version: %d\n", hdr.nvpf_version));
        KFDEBUG2((CE_CONT, "nvpf_size: %lld\n",
            (longlong_t)hdr.nvpf_size));
        KFDEBUG2((CE_CONT, "nvpf_hdr_chksum: 0x%x\n",
            hdr.nvpf_hdr_chksum));
        KFDEBUG2((CE_CONT, "nvpf_chksum: 0x%x\n", hdr.nvpf_chksum));

        /*
         * The header checksum was computed with its own field zeroed,
         * so zero it again before recomputing.
         */
        cksum = hdr.nvpf_hdr_chksum;
        hdr.nvpf_hdr_chksum = 0;
        hdrsum = nvp_cksum((uchar_t *)&hdr, sizeof (hdr));

        if (hdr.nvpf_magic != NVPF_HDR_MAGIC ||
            hdr.nvpf_version != NVPF_HDR_VERSION || hdrsum != cksum) {
                kobj_close_file(file);
                if (hdrsum != cksum) {
                        nvf_error("%s: checksum error "
                            "(actual 0x%x, expected 0x%x)\n",
                            filename, hdrsum, cksum);
                }
                nvf_error("%s: header information incorrect", filename);
                return (EINVAL);
        }

        /*
         * The size comes from the file and cannot be trusted, so it
         * is checked at run time rather than only by ASSERT.  A
         * negative size must never reach kmem_alloc(), and since the
         * read count is an int, a size that does not fit in one can
         * never be read back completely.
         */
        if (hdr.nvpf_size < 0 || hdr.nvpf_size != (int)hdr.nvpf_size) {
                kobj_close_file(file);
                nvf_error("%s: invalid data size %lld\n",
                    filename, (longlong_t)hdr.nvpf_size);
                return (EINVAL);
        }

        buf = kmem_alloc(hdr.nvpf_size, KM_SLEEP);
        n = kobj_read_file(file, buf, hdr.nvpf_size, offset);
        if (n != hdr.nvpf_size) {
                kmem_free(buf, hdr.nvpf_size);
                kobj_close_file(file);
                if (n < 0) {
                        nvf_error("%s: read error %d", filename, n);
                } else {
                        nvf_error("%s: incomplete read %d/%lld",
                            filename, n, (longlong_t)hdr.nvpf_size);
                }
                return (EINVAL);
        }
        offset += n;

        /* There must be nothing in the file beyond the data. */
        rval = kobj_read_file(file, &c, 1, offset);
        kobj_close_file(file);
        if (rval > 0) {
                nvf_error("%s is larger than %lld\n",
                    filename, (longlong_t)hdr.nvpf_size);
                kmem_free(buf, hdr.nvpf_size);
                return (EINVAL);
        }

        /* "actual" is the value computed here, "expected" the stored one */
        cksum = nvp_cksum((uchar_t *)buf, hdr.nvpf_size);
        if (hdr.nvpf_chksum != cksum) {
                nvf_error("%s: checksum error (actual 0x%x, expected 0x%x)\n",
                    filename, cksum, hdr.nvpf_chksum);
                kmem_free(buf, hdr.nvpf_size);
                return (EINVAL);
        }

        nvl = NULL;
        rval = nvlist_unpack(buf, hdr.nvpf_size, &nvl, 0);
        if (rval != 0) {
                nvf_error("%s: error %d unpacking nvlist\n",
                    filename, rval);
                kmem_free(buf, hdr.nvpf_size);
                return (EINVAL);
        }

        kmem_free(buf, hdr.nvpf_size);
        *ret_nvlist = nvl;
        return (0);
}

/*
 * Create (or truncate) 'filename' for writing and return a newly
 * allocated kfile_t describing it through *kfilep.
 *
 * Returns 0 on success, otherwise the vn_open() error, in which
 * case *kfilep is left untouched.  The name is not copied; the
 * kfile_t keeps the caller's pointer.
 */
static int
kfcreate(char *filename, kfile_t **kfilep)
{
        kfile_t *kf;
        int     err;

        ASSERT(modrootloaded);

        kf = kmem_alloc(sizeof (kfile_t), KM_SLEEP);
        kf->kf_fname = filename;
        kf->kf_vnflags = FCREAT | FWRITE | FTRUNC;
        kf->kf_state = 0;
        kf->kf_fpos = 0;

        KFDEBUG((CE_CONT, "create: %s flags 0x%x\n",
            filename, kf->kf_vnflags));
        err = vn_open(filename, UIO_SYSSPACE, kf->kf_vnflags,
            0444, &kf->kf_vp, CRCREAT, 0);
        if (err == 0) {
                *kfilep = kf;
                return (0);
        }

        kmem_free(kf, sizeof (kfile_t));
        KFDEBUG((CE_CONT, "%s: create error %d\n",
            filename, err));
        return (err);
}

/*
 * Remove the named file.  Returns the vn_remove() result:
 * 0 on success, an errno value otherwise.
 */
static int
kfremove(char *filename)
{
        int err;

        KFDEBUG((CE_CONT, "remove: %s\n", filename));

        err = vn_remove(filename, UIO_SYSSPACE, RMFILE);
        if (err == 0)
                return (0);

        KFDEBUG((CE_CONT, "%s: remove error %d\n",
            filename, err));
        return (err);
}

/*
 * Read up to bufsiz bytes from the file's current position into buf.
 *
 * On success returns 0, stores the number of bytes read in *ret_n
 * and advances the file position by that amount.  On failure the
 * error is latched in kf_state and returned; once latched, every
 * later call returns it without touching the file.
 */
static int
kfread(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
{
        ssize_t leftover;
        ssize_t nread;
        int     rc;

        ASSERT(modrootloaded);

        if (fp->kf_state != 0)
                return (fp->kf_state);

        rc = vn_rdwr(UIO_READ, fp->kf_vp, buf, bufsiz, fp->kf_fpos,
            UIO_SYSSPACE, 0, (rlim64_t)0, kcred, &leftover);
        if (rc != 0) {
                KFDEBUG((CE_CONT, "%s: read error %d\n",
                    fp->kf_fname, rc));
                fp->kf_state = rc;
                return (rc);
        }

        ASSERT(leftover >= 0 && leftover <= bufsiz);
        nread = bufsiz - leftover;

        KFDEBUG1((CE_CONT, "%s: read %ld bytes ok %ld bufsiz, %ld resid\n",
            fp->kf_fname, nread, bufsiz, leftover));

        *ret_n = nread;
        fp->kf_fpos += nread;
        return (0);
}

/*
 * Write bufsiz bytes from buf at the file's current position,
 * looping until everything is written or an error occurs.
 *
 * On success returns 0, stores the byte count (always bufsiz) in
 * *ret_n and leaves kf_fpos just past the data written, so that
 * consecutive calls append, mirroring kfread().  On failure the
 * error is latched in kf_state and returned; once latched, every
 * later call returns it without touching the file.  A write that
 * makes no progress at all is reported as ENOSPC.
 */
static int
kfwrite(kfile_t *fp, char *buf, ssize_t bufsiz, ssize_t *ret_n)
{
        rlim64_t        rlimit;
        ssize_t         resid;
        ssize_t         written;
        int             err;
        ssize_t         len;
        ssize_t         n = 0;

        ASSERT(modrootloaded);

        if (fp->kf_state != 0)
                return (fp->kf_state);

        len = bufsiz;
        /*
         * Allow the file to grow just past the end of this write,
         * measured from the starting offset rather than from zero.
         */
        rlimit = (rlim64_t)fp->kf_fpos + bufsiz + 1;
        for (;;) {
                err = vn_rdwr(UIO_WRITE, fp->kf_vp, buf, len, fp->kf_fpos,
                    UIO_SYSSPACE, FSYNC, rlimit, kcred, &resid);
                if (err) {
                        KFDEBUG((CE_CONT, "%s: write error %d\n",
                            fp->kf_fname, err));
                        fp->kf_state = err;
                        return (err);
                }

                KFDEBUG1((CE_CONT, "%s: write %ld bytes ok %ld resid\n",
                    fp->kf_fname, len-resid, resid));

                ASSERT(resid >= 0 && resid <= len);

                /* account for this chunk, including the final one */
                written = len - resid;
                n += written;
                fp->kf_fpos += written;
                if (resid == 0)
                        break;

                if (written == 0) {
                        KFDEBUG((CE_CONT, "%s: filesystem full?\n",
                            fp->kf_fname));
                        fp->kf_state = ENOSPC;
                        return (ENOSPC);
                }

                /* partial write: continue with the remainder */
                buf += written;
                len = resid;
        }

        ASSERT(n == bufsiz);
        KFDEBUG1((CE_CONT, "%s: wrote %ld bytes ok\n", fp->kf_fname, n));

        *ret_n = n;
        return (0);
}


/*
 * Close a file opened by kfcreate() and free its kfile_t.
 *
 * A file opened for writing with no latched error is synced first.
 * The vnode is always closed and released and the kfile_t always
 * freed, so fp must not be used after this call.
 *
 * Returns 0 if both the sync (when performed) and the close
 * succeeded.  Otherwise returns the close error, or the sync error
 * if only the sync failed, so that the caller does not treat data
 * that failed to reach stable storage as successfully written.
 */
static int
kfclose(kfile_t *fp)
{
        int             sync_err = 0;
        int             rval;

        KFDEBUG((CE_CONT, "close: %s\n", fp->kf_fname));

        if ((fp->kf_vnflags & FWRITE) && fp->kf_state == 0) {
                sync_err = VOP_FSYNC(fp->kf_vp, FSYNC, kcred, NULL);
                if (sync_err != 0) {
                        nvf_error("%s: sync error %d\n",
                            fp->kf_fname, sync_err);
                } else {
                        KFDEBUG((CE_CONT, "%s: sync ok\n", fp->kf_fname));
                }
        }

        rval = VOP_CLOSE(fp->kf_vp, fp->kf_vnflags, 1,
            (offset_t)0, kcred, NULL);
        if (rval != 0) {
                /* stay quiet if an earlier i/o error was already latched */
                if (fp->kf_state == 0) {
                        nvf_error("%s: close error %d\n",
                            fp->kf_fname, rval);
                }
        } else {
                if (fp->kf_state == 0)
                        KFDEBUG((CE_CONT, "%s: close ok\n", fp->kf_fname));
        }

        VN_RELE(fp->kf_vp);
        kmem_free(fp, sizeof (kfile_t));

        /* the close error takes precedence; otherwise report the sync */
        return (rval != 0 ? rval : sync_err);
}

/*
 * Rename oldname to newname.  Returns the vn_rename() result:
 * 0 on success, an errno value otherwise.
 */
static int
kfrename(char *oldname, char *newname)
{
        int err;

        ASSERT(modrootloaded);

        KFDEBUG((CE_CONT, "renaming %s to %s\n", oldname, newname));

        err = vn_rename(oldname, newname, UIO_SYSSPACE);
        if (err != 0) {
                KFDEBUG((CE_CONT, "rename %s to %s: %d\n",
                    oldname, newname, err));
        }

        return (err);
}

/*
 * Pack nvl and store it in 'filename', preceded by an nvpf_hdr_t
 * carrying the magic, version, data size and checksums that
 * fread_nvlist() verifies.
 *
 * The data is first written to "<filename>.<NEW_FILENAME_SUFFIX>"
 * and only renamed over 'filename' once it has been written and
 * closed without error, so a failed update leaves the previous file
 * intact.  If the write fails, the temporary file is removed.
 *
 * Returns 0 on success, otherwise the error from packing, creating,
 * writing, closing or renaming (EIO for a short write).
 */
int
fwrite_nvlist(char *filename, nvlist_t *nvl)
{
        char    *buf;
        char    *nvbuf;
        kfile_t *fp;
        char    *newname;
        int     len, err, err1;
        size_t  buflen;
        ssize_t n;

        ASSERT(modrootloaded);

        nvbuf = NULL;
        err = nvlist_pack(nvl, &nvbuf, &buflen, NV_ENCODE_NATIVE, 0);
        if (err != 0) {
                nvf_error("%s: error %d packing nvlist\n",
                    filename, err);
                return (err);
        }

        buf = kmem_alloc(sizeof (nvpf_hdr_t) + buflen, KM_SLEEP);
        bzero(buf, sizeof (nvpf_hdr_t));

        /*
         * The header checksum is computed last, over the header with
         * its own checksum field still zero.
         */
        ((nvpf_hdr_t *)buf)->nvpf_magic = NVPF_HDR_MAGIC;
        ((nvpf_hdr_t *)buf)->nvpf_version = NVPF_HDR_VERSION;
        ((nvpf_hdr_t *)buf)->nvpf_size = buflen;
        ((nvpf_hdr_t *)buf)->nvpf_chksum = nvp_cksum((uchar_t *)nvbuf, buflen);
        ((nvpf_hdr_t *)buf)->nvpf_hdr_chksum =
            nvp_cksum((uchar_t *)buf, sizeof (nvpf_hdr_t));

        bcopy(nvbuf, buf + sizeof (nvpf_hdr_t), buflen);
        kmem_free(nvbuf, buflen);
        /* from here on buflen is the size of header plus data */
        buflen += sizeof (nvpf_hdr_t);

        /* room for the name, '.', the suffix and the terminating NUL */
        len = strlen(filename) + MAX_SUFFIX_LEN + 2;
        newname = kmem_alloc(len, KM_SLEEP);

        (void) sprintf(newname, "%s.%s", filename, NEW_FILENAME_SUFFIX);

        /*
         * To make it unlikely we suffer data loss, write
         * data to the new temporary file.  Once successful
         * complete the transaction by renaming the new file
         * to replace the previous.
         */

        if ((err = kfcreate(newname, &fp)) == 0) {
                err = kfwrite(fp, buf, buflen, &n);
                if (err) {
                        nvf_error("%s: write error - %d\n",
                            newname, err);
                } else {
                        if (n != buflen) {
                                nvf_error(
                                    "%s: partial write %ld of %ld bytes\n",
                                    newname, n, buflen);
                                nvf_error("%s: filesystem may be full?\n",
                                    newname);
                                err = EIO;
                        }
                }
                /* always close; keep the first error seen */
                if ((err1 = kfclose(fp)) != 0) {
                        nvf_error("%s: close error\n", newname);
                        if (err == 0)
                                err = err1;
                }
                if (err != 0) {
                        if (kfremove(newname) != 0) {
                                nvf_error("%s: remove failed\n",
                                    newname);
                        }
                }
        } else {
                /* it is the temporary file that could not be created */
                nvf_error("%s: create failed - %d\n", newname, err);
        }

        if (err == 0) {
                if ((err = kfrename(newname, filename)) != 0) {
                        /* the rename goes from newname to filename */
                        nvf_error("%s: rename from %s failed\n",
                            filename, newname);
                }
        }

        kmem_free(newname, len);
        kmem_free(buf, buflen);

        return (err);
}

/*
 * Write nvl to the cache file of nvfd, mapping the outcome to
 * DDI_SUCCESS or DDI_FAILURE.  A failure with EROFS additionally
 * marks the file read-only.
 */
static int
e_fwrite_nvlist(nvfd_t *nvfd, nvlist_t *nvl)
{
        int err = fwrite_nvlist(nvfd->nvf_cache_path, nvl);

        if (err == 0)
                return (DDI_SUCCESS);

        if (err == EROFS)
                NVF_MARK_READONLY(nvfd);
        return (DDI_FAILURE);
}

/*
 * Have the client free its data list.  The data lock must be held
 * as writer on entry, and the client's handler must return with it
 * still held.
 */
static void
nvp_list_free(nvfd_t *nvf)
{
        nvf_handle_t handle = (nvf_handle_t)nvf;

        ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
        (nvf->nvf_list_free)(handle);
        ASSERT(RW_WRITE_HELD(&nvf->nvf_lock));
}

/*
 * Read a cache file in nvlist format and pass each top-level
 * element, which must itself be an nvlist, to the client's unpack
 * handler.  The data lock must be held as writer.
 *
 * Returns 0 on success.  On any failure after the file was read,
 * the client's list is freed again.  Errors:
 *      EIO - i/o error during read
 *      ENOENT - file not found
 *      EINVAL - file contents corrupted
 */
static int
fread_nvp_list(nvfd_t *nvfd)
{
        nvlist_t        *nvl;
        nvlist_t        *sublist;
        nvpair_t        *pair;
        char            *name;
        int             err;
        int             unpack_rv;

        ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));

        err = fread_nvlist(nvfd->nvf_cache_path, &nvl);
        if (err != 0)
                return (err);
        ASSERT(nvl != NULL);

        for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
            pair = nvlist_next_nvpair(nvl, pair)) {
                name = nvpair_name(pair);
                ASSERT(strlen(name) > 0);

                /* only nested nvlists are valid at the top level */
                if (nvpair_type(pair) != DATA_TYPE_NVLIST) {
                        nvf_error("%s: %s unsupported data type %d\n",
                            nvfd->nvf_cache_path, name, nvpair_type(pair));
                        err = EINVAL;
                        break;
                }

                err = nvpair_value_nvlist(pair, &sublist);
                if (err != 0) {
                        nvf_error(
                            "nvpair_value_nvlist error %s %d\n",
                            name, err);
                        break;
                }

                /*
                 * Let the client unpack this element and add it
                 * to its data list.
                 */
                ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
                unpack_rv = (nvfd->nvf_unpack_nvlist)
                    ((nvf_handle_t)nvfd, sublist, name);
                ASSERT(RW_WRITE_HELD(&(nvfd->nvf_lock)));
                if (unpack_rv != 0) {
                        nvf_error(
                            "%s: %s invalid list element\n",
                            nvfd->nvf_cache_path, name);
                        err = EINVAL;
                        break;
                }
        }

        nvlist_free(nvl);

        /* err is nonzero only if the loop was abandoned */
        if (err != 0)
                nvp_list_free(nvfd);
        return (err);
}


/*
 * Read the cache file behind the handle into the client's data
 * list.  The data lock must be held as writer.
 *
 * Returns 0 on success, or when reads are disabled through
 * kfio_disable_read.  Otherwise returns the error from reading, after
 * recording in nvf_flags which message should accompany the file's
 * next successful write: "Creating" for a missing file, "Rebuilding"
 * for any other failure.
 */
int
nvf_read_file(nvf_handle_t nvf_handle)
{
        nvfd_t *nvfd = (nvfd_t *)nvf_handle;
        int err;

        ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));

        if (kfio_disable_read)
                return (0);

        KFDEBUG((CE_CONT, "reading %s\n", nvfd->nvf_cache_path));

        err = fread_nvp_list(nvfd);
        if (err == 0)
                return (0);

        if (err == ENOENT) {
                nvfd->nvf_flags |= NVF_F_CREATE_MSG;
                nvf_error("%s: not found\n",
                    nvfd->nvf_cache_path);
        } else if (err == EIO) {
                nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
                cmn_err(CE_WARN, "%s: I/O error",
                    nvfd->nvf_cache_path);
        } else {
                /* EINVAL and anything unexpected: treat as corruption */
                nvfd->nvf_flags |= NVF_F_REBUILD_MSG;
                cmn_err(CE_WARN, "%s: data file corrupted",
                    nvfd->nvf_cache_path);
        }
        return (err);
}

/*
 * Invoke the client's write-complete handler, if it supplied one.
 */
static void
nvf_write_is_complete(nvfd_t *fd)
{
        if (fd->nvf_write_complete == NULL)
                return;

        (fd->nvf_write_complete)((nvf_handle_t)fd);
}

/*
 * Timeout handler for the flush timer armed by nvf_wake_daemon().
 *
 * If the deadline in nvpticks is still more than 4 ticks away
 * (nvf_wake_daemon() moves it forward on every call), the timer is
 * re-armed for the remaining time.  Otherwise the daemon is told to
 * flush and the timer is marked idle.
 */
/*ARGSUSED*/
static void
nvpflush_timeout(void *arg)
{
        clock_t nticks;

        mutex_enter(&nvpflush_lock);
        /* ticks left until the flush is due */
        nticks = nvpticks - ddi_get_lbolt();
        if (nticks > 4) {
                nvpflush_timer_busy = 1;
                mutex_exit(&nvpflush_lock);
                /*
                 * NOTE(review): nvpflush_id is stored after
                 * nvpflush_lock has been dropped - confirm nothing
                 * relies on it being consistent under the lock.
                 */
                nvpflush_id = timeout(nvpflush_timeout, NULL, nticks);
        } else {
                /* due now: hand the work to the daemon thread */
                do_nvpflush = 1;
                NVPDAEMON_DEBUG((CE_CONT, "signal nvpdaemon\n"));
                cv_signal(&nvpflush_cv);
                nvpflush_id = 0;
                nvpflush_timer_busy = 0;
                mutex_exit(&nvpflush_lock);
        }
}

/*
 * After marking a list as dirty, wake the nvpflush daemon
 * to perform the update.
 *
 * The daemon thread is created here if it is not already active.
 * The flush itself is deferred: the deadline is (re)set to
 * nvpflush_delay seconds from now, and the flush timer is armed
 * unless one is already pending.  A pending timer picks up the new
 * deadline when it fires (see nvpflush_timeout()).
 */
void
nvf_wake_daemon(void)
{
        clock_t nticks;

        /*
         * If the system isn't up yet or is shutting down,
         * don't even think about starting a flush.
         */
        if (!i_ddi_io_initialized() || sys_shutdown)
                return;

        mutex_enter(&nvpflush_lock);

        if (nvpflush_daemon_active == 0) {
                /*
                 * Claim the daemon slot while still holding the lock,
                 * then drop the lock around thread_create().
                 */
                nvpflush_daemon_active = 1;
                mutex_exit(&nvpflush_lock);
                NVPDAEMON_DEBUG((CE_CONT, "starting nvpdaemon thread\n"));
                nvpflush_thr_id = thread_create(NULL, 0,
                    (void (*)())nvpflush_daemon,
                    NULL, 0, &p0, TS_RUN, minclsyspri);
                mutex_enter(&nvpflush_lock);
        }

        /* every wakeup pushes the flush deadline out again */
        nticks = nvpflush_delay * TICKS_PER_SECOND;
        nvpticks = ddi_get_lbolt() + nticks;
        if (nvpflush_timer_busy == 0) {
                nvpflush_timer_busy = 1;
                mutex_exit(&nvpflush_lock);
                /*
                 * The 4 extra ticks match the threshold at which
                 * nvpflush_timeout() stops re-arming the timer.
                 */
                nvpflush_id = timeout(nvpflush_timeout, NULL, nticks + 4);
        } else
                mutex_exit(&nvpflush_lock);
}

/*
 * Flush one cache file if it is dirty.
 *
 * The data lock is taken as reader and upgraded to writer only to
 * pack the list into an nvlist; it is dropped for the duration of
 * the file write so clients can keep updating the data.
 *
 * Returns DDI_SUCCESS if there was nothing to do, if the file is
 * read-only, or if the write succeeded and the data was not dirtied
 * again meanwhile.  Returns DDI_FAILURE if the lock upgrade failed,
 * the list could not be packed, the write failed, or the data was
 * marked dirty again during the write.
 *
 * After a failed write the file is always left dirty so that the
 * update is attempted again; the "update failed" message is only
 * printed for the first failure in a row.
 */
static int
nvpflush_one(nvfd_t *nvfd)
{
        int rval = DDI_SUCCESS;
        nvlist_t *nvl;

        rw_enter(&nvfd->nvf_lock, RW_READER);

        ASSERT((nvfd->nvf_flags & NVF_F_FLUSHING) == 0);

        /*
         * NOTE(review): the dirty flag is cleared here while the lock
         * is held only as reader - confirm this cannot race with
         * other flag updates.
         */
        if (!NVF_IS_DIRTY(nvfd) ||
            NVF_IS_READONLY(nvfd) || kfio_disable_write || sys_shutdown) {
                NVF_CLEAR_DIRTY(nvfd);
                rw_exit(&nvfd->nvf_lock);
                return (DDI_SUCCESS);
        }

        /* clients have priority: back off if the upgrade fails */
        if (rw_tryupgrade(&nvfd->nvf_lock) == 0) {
                nvf_error("nvpflush: "
                    "%s rw upgrade failed\n", nvfd->nvf_cache_path);
                rw_exit(&nvfd->nvf_lock);
                return (DDI_FAILURE);
        }
        if (((nvfd->nvf_pack_list)
            ((nvf_handle_t)nvfd, &nvl)) != DDI_SUCCESS) {
                nvf_error("nvpflush: "
                    "%s nvlist construction failed\n", nvfd->nvf_cache_path);
                ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));
                rw_exit(&nvfd->nvf_lock);
                return (DDI_FAILURE);
        }
        ASSERT(RW_WRITE_HELD(&nvfd->nvf_lock));

        /*
         * Clear the dirty flag before dropping the lock, so an update
         * made during the write shows up as dirty again afterwards.
         */
        NVF_CLEAR_DIRTY(nvfd);
        nvfd->nvf_flags |= NVF_F_FLUSHING;
        rw_exit(&nvfd->nvf_lock);

        rval = e_fwrite_nvlist(nvfd, nvl);
        nvlist_free(nvl);

        rw_enter(&nvfd->nvf_lock, RW_WRITER);
        nvfd->nvf_flags &= ~NVF_F_FLUSHING;
        if (rval == DDI_FAILURE) {
                if (NVF_IS_READONLY(nvfd)) {
                        /* read-only filesystem: fail silently, no retry */
                        rval = DDI_SUCCESS;
                        nvfd->nvf_flags &= ~(NVF_F_ERROR | NVF_F_DIRTY);
                } else {
                        /* report only the first failure in a row */
                        if ((nvfd->nvf_flags & NVF_F_ERROR) == 0) {
                                cmn_err(CE_CONT,
                                    "%s: update failed\n",
                                    nvfd->nvf_cache_path);
                        }
                        /*
                         * The data did not reach the file, so it must
                         * stay dirty after every failure, not just
                         * the first, or the retry would find nothing
                         * to write.
                         */
                        nvfd->nvf_flags |= NVF_F_ERROR | NVF_F_DIRTY;
                }
        } else {
                if (nvfd->nvf_flags & NVF_F_CREATE_MSG) {
                        cmn_err(CE_CONT,
                            "!Creating %s\n", nvfd->nvf_cache_path);
                        nvfd->nvf_flags &= ~NVF_F_CREATE_MSG;
                }
                if (nvfd->nvf_flags & NVF_F_REBUILD_MSG) {
                        cmn_err(CE_CONT,
                            "!Rebuilding %s\n", nvfd->nvf_cache_path);
                        nvfd->nvf_flags &= ~NVF_F_REBUILD_MSG;
                }
                if (nvfd->nvf_flags & NVF_F_ERROR) {
                        cmn_err(CE_CONT,
                            "%s: update now ok\n", nvfd->nvf_cache_path);
                        nvfd->nvf_flags &= ~NVF_F_ERROR;
                }
                /*
                 * The file may need to be flushed again if the cached
                 * data was touched while writing the earlier contents.
                 */
                if (NVF_IS_DIRTY(nvfd))
                        rval = DDI_FAILURE;
        }

        rw_exit(&nvfd->nvf_lock);
        return (rval);
}


/*
 * Main loop of the cache flush daemon thread.
 *
 * The thread records itself in nvpflush_thread and then loops forever:
 *
 *   1. With nvpflush_lock held, wait on nvpflush_cv until do_nvpflush
 *      becomes non-zero.  If a timed wait returns -1 while no flush has
 *      been requested and nvpflush_timer_busy is zero, or if
 *      sys_shutdown is set, the thread clears nvpflush_thr_id and
 *      nvpflush_daemon_active and leaves through thread_exit().
 *   2. Set nvpbusy, clear do_nvpflush and drop nvpflush_lock.
 *   3. Under nvf_cache_mutex, move every dirty file from
 *      nvf_cache_files to nvf_dirty_files.
 *   4. Walk nvf_dirty_files and flush each file that is still dirty
 *      with nvpflush_one().  Files that are clean afterwards are moved
 *      back to nvf_cache_files; files that failed to flush or were
 *      dirtied again stay on the dirty list and cause
 *      nvf_wake_daemon() to be called at the end of the pass.
 *   5. Re-acquire nvpflush_lock, clear nvpbusy and go back to step 1.
 *
 * Takes no arguments and has no return statement; the only way out of
 * the loop is the thread_exit() call in step 1.
 */
static void
nvpflush_daemon(void)
{
        callb_cpr_t cprinfo;
        nvfd_t *nvfdp, *nextfdp;
        clock_t clk;
        int rval;
        int want_wakeup;
        int is_now_clean;

        ASSERT(modrootloaded);

        nvpflush_thread = curthread;
        NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: init\n"));

        /*
         * nvpflush_lock is the lock registered with the CPR callback
         * state, so it is the lock CALLB_CPR_EXIT will drop on the
         * way out (see the note in the exit path below).
         */
        CALLB_CPR_INIT(&cprinfo, &nvpflush_lock, callb_generic_cpr, "nvp");
        mutex_enter(&nvpflush_lock);
        for (;;) {
                /*
                 * Idle wait, bracketed by CALLB_CPR_SAFE_BEGIN/END.
                 * NOTE(review): presumably this marks the thread as
                 * safe to suspend while it is blocked here - confirm
                 * against the callb CPR documentation.
                 */
                CALLB_CPR_SAFE_BEGIN(&cprinfo);
                while (do_nvpflush == 0) {
                        /*
                         * The timeout is nvpdaemon_idle_time scaled by
                         * TICKS_PER_SECOND (presumably an idle time in
                         * seconds converted to clock ticks).
                         */
                        clk = cv_reltimedwait(&nvpflush_cv, &nvpflush_lock,
                            (nvpdaemon_idle_time * TICKS_PER_SECOND),
                            TR_CLOCK_TICK);
                        /*
                         * Exit when the wait returned -1 with no flush
                         * requested and nvpflush_timer_busy clear, or
                         * unconditionally once sys_shutdown is set.
                         */
                        if ((clk == -1 && do_nvpflush == 0 &&
                            nvpflush_timer_busy == 0) || sys_shutdown) {
                                /*
                                 * Note that CALLB_CPR_EXIT calls mutex_exit()
                                 * on the lock passed in to CALLB_CPR_INIT,
                                 * so the lock must be held when invoking it.
                                 */
                                CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);
                                NVPDAEMON_DEBUG((CE_CONT, "nvpdaemon: exit\n"));
                                ASSERT(mutex_owned(&nvpflush_lock));
                                /*
                                 * Clear the daemon's bookkeeping while
                                 * nvpflush_lock is still held.
                                 */
                                nvpflush_thr_id = NULL;
                                nvpflush_daemon_active = 0;
                                CALLB_CPR_EXIT(&cprinfo);
                                thread_exit();
                        }
                }
                CALLB_CPR_SAFE_END(&cprinfo, &nvpflush_lock);

                /*
                 * Mark the daemon busy and consume the flush request
                 * before dropping nvpflush_lock.  do_nvpflush is zero
                 * from here on, so if it is set again while this pass
                 * runs, the wait loop above is skipped next time
                 * around.
                 */
                nvpbusy = 1;
                want_wakeup = 0;
                do_nvpflush = 0;
                mutex_exit(&nvpflush_lock);

                /*
                 * Try flushing what's dirty, reschedule if there's
                 * a failure or data gets marked as dirty again.
                 * First move each file marked dirty to the dirty
                 * list to avoid locking the list across the write.
                 */
                mutex_enter(&nvf_cache_mutex);
                for (nvfdp = list_head(&nvf_cache_files);
                    nvfdp; nvfdp = nextfdp) {
                        /*
                         * Fetch the successor first: nvfdp may be
                         * unlinked from this list below.
                         */
                        nextfdp = list_next(&nvf_cache_files, nvfdp);
                        /* The dirty state is sampled under the reader lock. */
                        rw_enter(&nvfdp->nvf_lock, RW_READER);
                        if (NVF_IS_DIRTY(nvfdp)) {
                                list_remove(&nvf_cache_files, nvfdp);
                                list_insert_tail(&nvf_dirty_files, nvfdp);
                                rw_exit(&nvfdp->nvf_lock);
                        } else {
                                NVPDAEMON_DEBUG((CE_CONT,
                                    "nvpdaemon: not dirty %s\n",
                                    nvfdp->nvf_cache_path));
                                rw_exit(&nvfdp->nvf_lock);
                        }
                }
                mutex_exit(&nvf_cache_mutex);

                /*
                 * Now go through the dirty list
                 *
                 * nvf_cache_mutex is not held while walking this list
                 * or while writing; it is only taken briefly to move a
                 * clean file back to nvf_cache_files.
                 */
                for (nvfdp = list_head(&nvf_dirty_files);
                    nvfdp; nvfdp = nextfdp) {
                        /*
                         * Fetch the successor first: nvfdp may be
                         * unlinked from the dirty list below.
                         */
                        nextfdp = list_next(&nvf_dirty_files, nvfdp);

                        is_now_clean = 0;
                        rw_enter(&nvfdp->nvf_lock, RW_READER);
                        if (NVF_IS_DIRTY(nvfdp)) {
                                NVPDAEMON_DEBUG((CE_CONT,
                                    "nvpdaemon: flush %s\n",
                                    nvfdp->nvf_cache_path));
                                /*
                                 * The reader lock is released before
                                 * the flush and re-taken afterwards to
                                 * re-check the dirty state.
                                 */
                                rw_exit(&nvfdp->nvf_lock);
                                rval = nvpflush_one(nvfdp);
                                rw_enter(&nvfdp->nvf_lock, RW_READER);
                                if (rval != DDI_SUCCESS ||
                                    NVF_IS_DIRTY(nvfdp)) {
                                        /*
                                         * Flush failed or the file was
                                         * dirtied again: leave it on
                                         * the dirty list and ask for
                                         * another pass.
                                         */
                                        rw_exit(&nvfdp->nvf_lock);
                                        NVPDAEMON_DEBUG((CE_CONT,
                                            "nvpdaemon: %s dirty again\n",
                                            nvfdp->nvf_cache_path));
                                        want_wakeup = 1;
                                } else {
                                        /*
                                         * Written and still clean: run
                                         * the completion hook with the
                                         * file lock released.
                                         */
                                        rw_exit(&nvfdp->nvf_lock);
                                        nvf_write_is_complete(nvfdp);
                                        is_now_clean = 1;
                                }
                        } else {
                                /*
                                 * On the dirty list but no longer
                                 * dirty: nothing to write, and
                                 * nvf_write_is_complete() is not
                                 * called on this path.
                                 */
                                NVPDAEMON_DEBUG((CE_CONT,
                                    "nvpdaemon: not dirty %s\n",
                                    nvfdp->nvf_cache_path));
                                rw_exit(&nvfdp->nvf_lock);
                                is_now_clean = 1;
                        }

                        /* Return clean files to the main cache file list. */
                        if (is_now_clean) {
                                mutex_enter(&nvf_cache_mutex);
                                list_remove(&nvf_dirty_files, nvfdp);
                                list_insert_tail(&nvf_cache_files,
                                    nvfdp);
                                mutex_exit(&nvf_cache_mutex);
                        }
                }

                /*
                 * At least one file is still dirty; reschedule.
                 * nvf_wake_daemon() is called with neither
                 * nvpflush_lock nvpflush_lock nor nvf_cache_mutex held.
                 */
                if (want_wakeup)
                        nvf_wake_daemon();

                /*
                 * Re-take nvpflush_lock for the next wait and clear
                 * the busy flag under it.
                 */
                mutex_enter(&nvpflush_lock);
                nvpbusy = 0;
        }
}