root/cddl/usr.sbin/zfsd/case_file.cc
/*-
 * Copyright (c) 2011, 2012, 2013, 2014, 2016 Spectra Logic Corporation
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions, and the following disclaimer,
 *    without modification.
 * 2. Redistributions in binary form must reproduce at minimum a disclaimer
 *    substantially similar to the "NO WARRANTY" disclaimer below
 *    ("Disclaimer") and any redistribution must be conditioned upon
 *    including a substantially similar Disclaimer requirement for further
 *    binary redistribution.
 *
 * NO WARRANTY
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGES.
 *
 * Authors: Justin T. Gibbs     (Spectra Logic Corporation)
 */

/**
 * \file case_file.cc
 *
 * We keep case files for any leaf vdev that is not in the optimal state.
 * However, we only serialize to disk those events that need to be preserved
 * across reboots.  For now, this is just a log of soft errors which we
 * accumulate in order to mark a device as degraded.
 */
#include <sys/cdefs.h>
#include <sys/byteorder.h>
#include <sys/time.h>

#include <sys/fs/zfs.h>

#include <dirent.h>
#include <fcntl.h>
#include <iomanip>
#include <fstream>
#include <functional>
#include <sstream>
#include <syslog.h>
#include <unistd.h>

#include <libzutil.h>
#include <libzfs.h>

#include <list>
#include <map>
#include <string>

#include <devdctl/guid.h>
#include <devdctl/event.h>
#include <devdctl/event_factory.h>
#include <devdctl/exception.h>
#include <devdctl/consumer.h>

#include "callout.h"
#include "vdev_iterator.h"
#include "zfsd_event.h"
#include "case_file.h"
#include "vdev.h"
#include "zfsd.h"
#include "zfsd_exception.h"
#include "zpool_list.h"
/*============================ Namespace Control =============================*/
using std::hex;
using std::ifstream;
using std::stringstream;
using std::setfill;
using std::setw;

using DevdCtl::Event;
using DevdCtl::EventFactory;
using DevdCtl::EventList;
using DevdCtl::Guid;
using DevdCtl::ParseException;

/*--------------------------------- CaseFile ---------------------------------*/
//- CaseFile Static Data -------------------------------------------------------

// All currently open cases.  Entries are added by the CaseFile
// constructor and removed by the destructor (see s_activeCases.remove).
CaseFileList  CaseFile::s_activeCases;
// Directory in which case state is serialized so it survives a
// restart of zfsd (see Serialize()/DeSerialize()).
const string  CaseFile::s_caseFilePath = "/var/db/zfsd/cases";

//- CaseFile Static Public Methods ---------------------------------------------
CaseFile *
CaseFile::Find(Guid poolGUID, Guid vdevGUID)
{
	/*
	 * Scan the active case list.  Only one active case exists
	 * per-vdev, so the first match is the only match.  An invalid
	 * poolGUID acts as a wildcard for the pool.
	 */
	for (CaseFileList::iterator it(s_activeCases.begin());
	     it != s_activeCases.end(); ++it) {
		CaseFile *candidate(*it);
		bool poolMatch(candidate->PoolGUID() == poolGUID
			    || Guid::InvalidGuid() == poolGUID);

		if (poolMatch && candidate->VdevGUID() == vdevGUID)
			return (candidate);
	}
	return (NULL);
}

void
CaseFile::Find(Guid poolGUID, Guid vdevGUID, CaseFileList &cases)
{
	CaseFileList::iterator it(s_activeCases.begin());

	/*
	 * Collect every matching case.  Spare vdevs may have several
	 * cases open at once; any other vdev has at most one, so stop
	 * at the first non-spare hit.
	 */
	for (; it != s_activeCases.end(); ++it) {
		CaseFile *candidate(*it);
		bool poolMatch(candidate->PoolGUID() == poolGUID ||
		    Guid::InvalidGuid() == poolGUID);

		if (!poolMatch || candidate->VdevGUID() != vdevGUID)
			continue;

		cases.push_back(candidate);
		if (!candidate->IsSpare())
			return;
	}
}

CaseFile *
CaseFile::Find(const string &physPath)
{
	CaseFile *match(NULL);
	CaseFileList::iterator it;

	/*
	 * Walk the whole list so that duplicate matches (which should
	 * never happen) can be detected and logged; the last match wins.
	 */
	for (it = s_activeCases.begin(); it != s_activeCases.end(); ++it) {
		if ((*it)->PhysicalPath() != physPath)
			continue;

		if (match != NULL) {
			syslog(LOG_WARNING, "Multiple casefiles found for "
			    "physical path %s.  "
			    "This is most likely a bug in zfsd",
			    physPath.c_str());
		}
		match = *it;
	}
	return (match);
}


void
CaseFile::ReEvaluateByGuid(Guid poolGUID, const ZfsEvent &event)
{
	CaseFileList::iterator it(s_activeCases.begin());

	/*
	 * ReEvaluate() may close (and therefore delete) the case it is
	 * invoked on, invalidating that case's iterator.  Advance the
	 * iterator before dispatching so traversal stays valid.
	 */
	while (it != s_activeCases.end()) {
		CaseFile *current(*it);

		++it;
		if (poolGUID == current->PoolGUID())
			current->ReEvaluate(event);
	}
}

CaseFile &
CaseFile::Create(Vdev &vdev)
{
	/* Reuse the existing case for this vdev, if one is open. */
	CaseFile *theCase(Find(vdev.PoolGUID(), vdev.GUID()));

	if (theCase == NULL)
		theCase = new CaseFile(vdev);

	return (*theCase);
}

void
CaseFile::DeSerialize()
{
	struct dirent **caseFiles;
	int		numCaseFiles;

	/*
	 * Collect all entries that look like case files (see
	 * DeSerializeSelector) and replay each one.  On scandir(3)
	 * failure nothing was allocated, so just bail.
	 */
	numCaseFiles = scandir(s_caseFilePath.c_str(), &caseFiles,
	    DeSerializeSelector, /*compar*/NULL);
	if (numCaseFiles < 0)
		return;

	for (int i = 0; i < numCaseFiles; i++) {
		DeSerializeFile(caseFiles[i]->d_name);
		free(caseFiles[i]);
	}
	free(caseFiles);
}

bool
CaseFile::Empty()
{
	/* True when zfsd currently tracks no cases at all. */
	return (s_activeCases.size() == 0);
}

void
CaseFile::LogAll()
{
        for (CaseFileList::iterator curCase = s_activeCases.begin();
             curCase != s_activeCases.end(); curCase++)
                (*curCase)->Log();
}

void
CaseFile::PurgeAll()
{
        /*
         * Serialize casefiles before deleting them so that they can be reread
         * and revalidated during BuildCaseFiles.
         * CaseFiles remove themselves from this list on destruction.
         */
        while (s_activeCases.size() != 0) {
                CaseFile *casefile = s_activeCases.front();
                casefile->Serialize();
                delete casefile;
        }

}

/* Accessor: nonzero iff this case's vdev is a hot spare. */
int
CaseFile::IsSpare()
{
        return (m_is_spare);
}

//- CaseFile Public Methods ----------------------------------------------------
bool
CaseFile::RefreshVdevState()
{
        ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
        zpool_handle_t *casePool(zpl.empty() ? NULL : zpl.front());
        if (casePool == NULL)
                return (false);

        Vdev vd(casePool, CaseVdev(casePool));
        if (vd.DoesNotExist())
                return (false);

        m_vdevState    = vd.State();
        m_vdevPhysPath = vd.PhysicalPath();
        m_vdevName = vd.Name(casePool, false);
        return (true);
}

/**
 * \brief Try to resolve this case using a newly arrived device.
 *
 * \param devPath   Device node path of the newly arrived device.
 * \param physPath  Physical path string reported for the new device.
 * \param vdev      Vdev data for the new device if it is already a pool
 *                  member, otherwise NULL.
 *
 * \return  true if the arrival event was consumed by this case.
 */
bool
CaseFile::ReEvaluate(const string &devPath, const string &physPath, Vdev *vdev)
{
        ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
        zpool_handle_t *pool(zpl.empty() ? NULL : zpl.front());
        int flags = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;

        if (pool == NULL || !RefreshVdevState()) {
                /*
                 * The pool or vdev for this case file is no longer
                 * part of the configuration.  This can happen
                 * if we process a device arrival notification
                 * before seeing the ZFS configuration change
                 * event.
                 */
                syslog(LOG_INFO,
                       "CaseFile::ReEvaluate(%s,%s) Pool/Vdev unconfigured.  "
                       "Closing\n",
                       PoolGUIDString().c_str(),
                       VdevGUIDString().c_str());
                Close();

                /*
                 * Since this event was not used to close this
                 * case, do not report it as consumed.
                 */
                return (/*consumed*/false);
        }

        if (VdevState() > VDEV_STATE_FAULTED) {
                /*
                 * For now, newly discovered devices only help for
                 * devices that are missing.  In the future, we might
                 * use a newly inserted spare to replace a degraded
                 * or faulted device.
                 */
                syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
                    PoolGUIDString().c_str(), VdevGUIDString().c_str());
                return (/*consumed*/false);
        }
        if (VdevState() == VDEV_STATE_OFFLINE) {
                /*
                 * OFFLINE is an administrative decision.  No need for zfsd to
                 * do anything.
                 */
                syslog(LOG_INFO, "CaseFile::ReEvaluate(%s,%s): Pool/Vdev ignored",
                    PoolGUIDString().c_str(), VdevGUIDString().c_str());
                return (/*consumed*/false);
        }

        /*
         * If the new device IS this case's vdev (it returned to the
         * system), simply try to online it in place.
         */
        if (vdev != NULL
         && ( vdev->PoolGUID() == m_poolGUID
           || vdev->PoolGUID() == Guid::InvalidGuid())
         && vdev->GUID() == m_vdevGUID) {

                if (IsSpare())
                        flags |= ZFS_ONLINE_SPARE;
                if (zpool_vdev_online(pool, vdev->GUIDString().c_str(),
                    flags, &m_vdevState) != 0) {
                        syslog(LOG_ERR,
                            "Failed to online vdev(%s/%s:%s): %s: %s\n",
                            zpool_get_name(pool), vdev->GUIDString().c_str(),
                            devPath.c_str(), libzfs_error_action(g_zfsHandle),
                            libzfs_error_description(g_zfsHandle));
                        return (/*consumed*/false);
                }

                syslog(LOG_INFO, "Onlined vdev(%s/%s:%s).  State now %s.\n",
                       zpool_get_name(pool), vdev->GUIDString().c_str(),
                       devPath.c_str(),
                       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));

                /*
                 * Check the vdev state post the online action to see
                 * if we can retire this case.
                 */
                CloseIfSolved();

                return (/*consumed*/true);
        }

        /*
         * If the auto-replace policy is enabled, and we have physical
         * path information, try a physical path replacement.
         */
        if (zpool_get_prop_int(pool, ZPOOL_PROP_AUTOREPLACE, NULL) == 0) {
                syslog(LOG_INFO,
                       "CaseFile(%s:%s:%s): AutoReplace not set.  "
                       "Ignoring device insertion.\n",
                       PoolGUIDString().c_str(),
                       VdevGUIDString().c_str(),
                       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
                return (/*consumed*/false);
        }

        if (PhysicalPath().empty()) {
                syslog(LOG_INFO,
                       "CaseFile(%s:%s:%s): No physical path information.  "
                       "Ignoring device insertion.\n",
                       PoolGUIDString().c_str(),
                       VdevGUIDString().c_str(),
                       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
                return (/*consumed*/false);
        }

        if (physPath != PhysicalPath()) {
                syslog(LOG_INFO,
                       "CaseFile(%s:%s:%s): Physical path mismatch.  "
                       "Ignoring device insertion.\n",
                       PoolGUIDString().c_str(),
                       VdevGUIDString().c_str(),
                       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
                return (/*consumed*/false);
        }

        /* Write a label on the newly inserted disk. */
        if (zpool_label_disk(g_zfsHandle, pool, devPath.c_str()) != 0) {
                syslog(LOG_ERR,
                       "Replace vdev(%s/%s) by physical path (label): %s: %s\n",
                       zpool_get_name(pool), VdevGUIDString().c_str(),
                       libzfs_error_action(g_zfsHandle),
                       libzfs_error_description(g_zfsHandle));
                return (/*consumed*/false);
        }

        /* New disk in the same slot: replace the missing vdev with it. */
        syslog(LOG_INFO, "CaseFile::ReEvaluate(%s/%s): Replacing with %s",
            PoolGUIDString().c_str(), VdevGUIDString().c_str(),
            devPath.c_str());
        return (Replace(VDEV_TYPE_DISK, devPath.c_str(), /*isspare*/false));
}

/**
 * \brief Re-evaluate this case in light of a new ZFS event.
 *
 * \param event  The event to process.
 *
 * \return  true if the event was consumed (used to close the case,
 *          activate a spare, or recorded as a tentative error).
 */
bool
CaseFile::ReEvaluate(const ZfsEvent &event)
{
        bool consumed(false);

        if (event.Value("type") == "sysevent.fs.zfs.vdev_remove") {
                /*
                 * The Vdev we represent has been removed from the
                 * configuration.  This case is no longer of value.
                 */
                Close();

                return (/*consumed*/true);
        } else if (event.Value("type") == "sysevent.fs.zfs.pool_destroy") {
                /* This Pool has been destroyed.  Discard the case */
                Close();

                return (/*consumed*/true);
        } else if (event.Value("type") == "sysevent.fs.zfs.config_sync") {
                /*
                 * A config change landed.  If our vdev is still unhealthy
                 * (and not administratively offline), a spare may now be
                 * available to repair it.
                 */
                RefreshVdevState();
                if (VdevState() < VDEV_STATE_HEALTHY &&
                    VdevState() != VDEV_STATE_OFFLINE)
                        consumed = ActivateSpare();
        }


        if (event.Value("class") == "resource.fs.zfs.removed") {
                bool spare_activated;

                if (!RefreshVdevState()) {
                        /*
                         * The pool or vdev for this case file is no longer
                         * part of the configuration.  This can happen
                         * if we process a device arrival notification
                         * before seeing the ZFS configuration change
                         * event.
                         */
                        syslog(LOG_INFO,
                               "CaseFile::ReEvaluate(%s,%s) Pool/Vdev "
                               "unconfigured.  Closing\n",
                               PoolGUIDString().c_str(),
                               VdevGUIDString().c_str());
                        /*
                         * Close the case now so we won't waste cycles in the
                         * system rescan
                         */
                        Close();

                        /*
                         * Since this event was not used to close this
                         * case, do not report it as consumed.
                         */
                        return (/*consumed*/false);
                }

                /*
                 * Discard any tentative I/O error events for
                 * this case.  They were most likely caused by the
                 * hot-unplug of this device.
                 */
                PurgeTentativeEvents();

                /* Try to activate spares if they are available */
                spare_activated = ActivateSpare();

                /*
                 * Rescan the drives in the system to see if a recent
                 * drive arrival can be used to solve this case.
                 */
                ZfsDaemon::RequestSystemRescan();

                /*
                 * Consume the event if we successfully activated a spare.
                 * Otherwise, leave it in the unconsumed events list so that the
                 * future addition of a spare to this pool might be able to
                 * close the case
                 */
                consumed = spare_activated;
        } else if (event.Value("class") == "resource.fs.zfs.statechange") {
                RefreshVdevState();
                /*
                 * If this vdev is DEGRADED, FAULTED, or UNAVAIL, try to
                 * activate a hotspare.  Otherwise, ignore the event
                 */
                if (VdevState() == VDEV_STATE_FAULTED ||
                    VdevState() == VDEV_STATE_DEGRADED ||
                    VdevState() == VDEV_STATE_CANT_OPEN)
                        (void) ActivateSpare();
                consumed = true;
        }
        else if (event.Value("class") == "ereport.fs.zfs.io" ||
                 event.Value("class") == "ereport.fs.zfs.checksum" ||
                 event.Value("class") == "ereport.fs.zfs.delay") {

                /*
                 * Soft errors accumulate as tentative events; the grace
                 * period callout decides whether they become permanent.
                 */
                m_tentativeEvents.push_front(event.DeepCopy());
                RegisterCallout(event);
                consumed = true;
        }

        bool closed(CloseIfSolved());

        return (consumed || closed);
}

/*
 * Depth-first search of the vdev tree rooted at "config" for the vdev
 * that directly contains the child with the given GUID.  Returns the
 * parent's nvlist, or NULL if the child is not found in this subtree.
 */
static nvlist_t*
find_parent(nvlist_t *pool_config, nvlist_t *config, DevdCtl::Guid child_guid)
{
	nvlist_t **children;
	unsigned   numChildren;

	if (nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_CHILDREN,
	    &children, &numChildren) != 0 || numChildren == 0)
		return (NULL);

	for (unsigned c = 0; c < numChildren; c++) {
		Vdev vdev(pool_config, children[c]);

		/* Direct child?  Then "config" is the parent. */
		if (vdev.GUID() == child_guid)
			return (config);

		/* Otherwise recurse into this child's subtree. */
		nvlist_t *found(find_parent(pool_config, children[c],
		    child_guid));
		if (found != NULL)
			return (found);
	}

	return (NULL);
}

/**
 * \brief Try to replace this case's vdev with an available hot spare.
 *
 * \return  true if a healthy, unused spare was found and a replace
 *          operation was initiated; otherwise false.
 */
bool
CaseFile::ActivateSpare() {
        nvlist_t        *config, *nvroot, *parent_config;
        nvlist_t       **spares;
        const char      *devPath, *poolname, *vdev_type;
        u_int            nspares, i;
        int              error;

        /* Locate the live pool handle and its current configuration. */
        ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
        zpool_handle_t  *zhp(zpl.empty() ? NULL : zpl.front());
        if (zhp == NULL) {
                syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
                       "for pool_guid %" PRIu64".", (uint64_t)m_poolGUID);
                return (false);
        }
        poolname = zpool_get_name(zhp);
        config = zpool_get_config(zhp, NULL);
        if (config == NULL) {
                syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find pool "
                       "config for pool %s", poolname);
                return (false);
        }
        error = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot);
        if (error != 0){
                syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not find vdev "
                       "tree for pool %s", poolname);
                return (false);
        }

        parent_config = find_parent(config, nvroot, m_vdevGUID);
        if (parent_config != NULL) {
                const char *parent_type;

                /*
                 * Don't activate spares for members of a "replacing" vdev.
                 * They're already dealt with.  Sparing them will just drag out
                 * the resilver process.
                 */
                error = nvlist_lookup_string(parent_config,
                    ZPOOL_CONFIG_TYPE, &parent_type);
                if (error == 0 && strcmp(parent_type, VDEV_TYPE_REPLACING) == 0)
                        return (false);
        }

        /*
         * A lookup failure leaves nspares at 0, which is handled as
         * "no spares configured" below; the error itself is ignored.
         */
        nspares = 0;
        nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
                                   &nspares);
        if (nspares == 0) {
                /* The pool has no spares configured */
                syslog(LOG_INFO, "CaseFile::ActivateSpare: "
                       "No spares available for pool %s", poolname);
                return (false);
        }
        /* Find the first spare that is healthy and not already in use. */
        for (i = 0; i < nspares; i++) {
                uint64_t    *nvlist_array;
                vdev_stat_t *vs;
                uint_t       nstats;

                if (nvlist_lookup_uint64_array(spares[i],
                    ZPOOL_CONFIG_VDEV_STATS, &nvlist_array, &nstats) != 0) {
                        syslog(LOG_ERR, "CaseFile::ActivateSpare: Could not "
                               "find vdev stats for pool %s, spare %d",
                               poolname, i);
                        return (false);
                }
                vs = reinterpret_cast<vdev_stat_t *>(nvlist_array);

                if ((vs->vs_aux != VDEV_AUX_SPARED)
                 && (vs->vs_state == VDEV_STATE_HEALTHY)) {
                        /* We found a usable spare */
                        break;
                }
        }

        if (i == nspares) {
                /* No available spares were found */
                return (false);
        }

        error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH, &devPath);
        if (error != 0) {
                syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
                       "the path of pool %s, spare %d. Error %d",
                       poolname, i, error);
                return (false);
        }

        error = nvlist_lookup_string(spares[i], ZPOOL_CONFIG_TYPE, &vdev_type);
        if (error != 0) {
                syslog(LOG_ERR, "CaseFile::ActivateSpare: Cannot determine "
                       "the vdev type of pool %s, spare %d. Error %d",
                       poolname, i, error);
                return (false);
        }

        return (Replace(vdev_type, devPath, /*isspare*/true));
}

/* Does the argument event refer to a checksum error? */
static bool
IsChecksumEvent(const Event* const event)
{
	return (event->Value("type") == "ereport.fs.zfs.checksum");
}

/* Does the argument event refer to an IO error? */
static bool
IsIOEvent(const Event* const event)
{
	return (event->Value("type") == "ereport.fs.zfs.io");
}

/* Does the argument event refer to an IO delay? */
static bool
IsDelayEvent(const Event* const event)
{
	return (event->Value("type") == "ereport.fs.zfs.delay");
}

/**
 * \brief Arm (or re-arm) the timer that promotes tentative error
 *        events into permanent events once the grace period expires.
 *
 * \param event  The soft-error event that was just recorded.
 */
void
CaseFile::RegisterCallout(const Event &event)
{
        timeval now, countdown, elapsed, timestamp, zero, remaining;
        /**
         * The time ZFSD waits before promoting a tentative event
         * into a permanent event.
         */
        int sec = -1;
        /*
         * Size the grace period from the per-vdev property matching the
         * event class.  NOTE(review): this assumes GetVdevProp() returns
         * -1 when the property is unset -- confirm against case_file.h.
         */
        if (IsChecksumEvent(&event))
                sec = CaseFile::GetVdevProp(VDEV_PROP_CHECKSUM_T);
        else if (IsIOEvent(&event))
                sec = CaseFile::GetVdevProp(VDEV_PROP_IO_T);
        else if (IsDelayEvent(&event))
                sec = CaseFile::GetVdevProp(VDEV_PROP_SLOW_IO_T);

        if (sec == -1)
                sec = 60; /* default */

        timeval removeGracePeriod = {
            sec, /*sec*/
            0 /*usec*/
        };

        /*
         * Charge the grace period from the event's own timestamp, not
         * from now: time already elapsed since the event was generated
         * is subtracted from the countdown.
         */
        gettimeofday(&now, 0);
        timestamp = event.GetTimestamp();
        timersub(&now, &timestamp, &elapsed);
        timersub(&removeGracePeriod, &elapsed, &countdown);
        /*
         * If countdown is <= zero, Reset the timer to the
         * smallest positive time value instead
         */
        timerclear(&zero);
        if (timercmp(&countdown, &zero, <=)) {
                timerclear(&countdown);
                countdown.tv_usec = 1;
        }

        remaining = m_tentativeTimer.TimeRemaining();

        /*
         * Only re-arm when the timer is idle or the new deadline would
         * fire sooner than the one already pending; never push an
         * existing deadline further out.
         */
        if (!m_tentativeTimer.IsPending()
         || timercmp(&countdown, &remaining, <))
                m_tentativeTimer.Reset(countdown, OnGracePeriodEnded, this);
}


/**
 * \brief Close this case if it no longer serves a purpose.
 *
 * A case with no recorded events is closed when its vdev is healthy or
 * administratively offline; otherwise it is kept open (and re-serialized
 * to drop stale event data) in case a future device arrival or spare
 * can solve it.
 *
 * \return  true if the case was closed (and this object deleted).
 */
bool
CaseFile::CloseIfSolved()
{
        if (m_events.empty()
         && m_tentativeEvents.empty()) {

                /*
                 * We currently do not track or take actions on
                 * devices in the degraded or faulted state.
                 * Once we have support for spare pools, we'll
                 * retain these cases so that any spares added in
                 * the future can be applied to them.
                 */
                switch (VdevState()) {
                case VDEV_STATE_HEALTHY:
                        /* No need to keep cases for healthy vdevs */
                        /* FALLTHROUGH */
                case VDEV_STATE_OFFLINE:
                        /*
                         * Offline is a deliberate administrative action.  zfsd
                         * doesn't need to do anything for this state.
                         */
                        Close();
                        return (true);
                case VDEV_STATE_REMOVED:
                case VDEV_STATE_CANT_OPEN:
                        /*
                         * Keep open.  We may solve it with a newly inserted
                         * device.
                         */
                        /* FALLTHROUGH */
                case VDEV_STATE_FAULTED:
                case VDEV_STATE_DEGRADED:
                        /*
                         * Keep open.  We may solve it with the future
                         * addition of a spare to the pool
                         */
                        /* FALLTHROUGH */
                case VDEV_STATE_UNKNOWN:
                case VDEV_STATE_CLOSED:
                        /*
                         * Keep open?  This may not be the correct behavior,
                         * but it's what we've always done
                         */
                        ;
                }

                /*
                 * Re-serialize the case in order to remove any
                 * previous event data.
                 */
                Serialize();
        }

        return (false);
}

void
CaseFile::Log()
{
        syslog(LOG_INFO, "CaseFile(%s,%s,%s)\n", PoolGUIDString().c_str(),
               VdevGUIDString().c_str(), PhysicalPath().c_str());
        syslog(LOG_INFO, "\tVdev State = %s\n",
               zpool_state_to_name(VdevState(), VDEV_AUX_NONE));
        if (m_tentativeEvents.size() != 0) {
                syslog(LOG_INFO, "\t=== Tentative Events ===\n");
                for (EventList::iterator event(m_tentativeEvents.begin());
                     event != m_tentativeEvents.end(); event++)
                        (*event)->Log(LOG_INFO);
        }
        if (m_events.size() != 0) {
                syslog(LOG_INFO, "\t=== Events ===\n");
                for (EventList::iterator event(m_events.begin());
                     event != m_events.end(); event++)
                        (*event)->Log(LOG_INFO);
        }
}

//- CaseFile Static Protected Methods ------------------------------------------
void
CaseFile::OnGracePeriodEnded(void *arg)
{
        CaseFile &casefile(*static_cast<CaseFile *>(arg));

        casefile.OnGracePeriodEnded();
}

/*
 * scandir(3) selector: accept only regular files whose names match the
 * "pool_<guid>_vdev_<guid>.case" pattern produced by Serialize().
 */
int
CaseFile::DeSerializeSelector(const struct dirent *dirEntry)
{
	uint64_t poolGUID;
	uint64_t vdevGUID;

	if (dirEntry->d_type != DT_REG)
		return (0);
	if (sscanf(dirEntry->d_name, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
	    &poolGUID, &vdevGUID) != 2)
		return (0);
	return (1);
}

/**
 * \brief Reconstruct a CaseFile from one serialized case file on disk.
 *
 * \param fileName  Directory entry name (relative to s_caseFilePath)
 *                  of the case file, as returned by scandir(3).
 *
 * Stale files (vdev already degraded/faulted, pool or vdev gone, or
 * unparsable content) are unlinked rather than replayed.
 */
void
CaseFile::DeSerializeFile(const char *fileName)
{
	string	  fullName(s_caseFilePath + '/' + fileName);
	CaseFile *existingCaseFile(NULL);
	CaseFile *caseFile(NULL);

	try {
		uint64_t poolGUID;
		uint64_t vdevGUID;
		nvlist_t *vdevConf;

		/* Recover the pool and vdev GUIDs from the file name. */
		if (sscanf(fileName, "pool_%" PRIu64 "_vdev_%" PRIu64 ".case",
		       &poolGUID, &vdevGUID) != 2) {
			throw ZfsdException("CaseFile::DeSerialize: "
			    "Unintelligible CaseFile filename %s.\n", fileName);
		}
		existingCaseFile = Find(Guid(poolGUID), Guid(vdevGUID));
		if (existingCaseFile != NULL) {
			/*
			 * If the vdev is already degraded or faulted,
			 * there's no point in keeping the state around
			 * that we use to put a drive into the degraded
			 * state.  However, if the vdev is simply missing,
			 * preserve the case data in the hopes that it will
			 * return.
			 */
			caseFile = existingCaseFile;
			vdev_state curState(caseFile->VdevState());
			if (curState > VDEV_STATE_CANT_OPEN
			 && curState < VDEV_STATE_HEALTHY) {
				/*
				 * Unlink via the full path: fileName is
				 * relative to s_caseFilePath, which is not
				 * the daemon's working directory.
				 */
				unlink(fullName.c_str());
				return;
			}
		} else {
			ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
			if (zpl.empty()
			 || (vdevConf = VdevIterator(zpl.front())
						    .Find(vdevGUID)) == NULL) {
				/*
				 * Either the pool no longer exists
				 * or this vdev is no longer a member of
				 * the pool.
				 */
				unlink(fullName.c_str());
				return;
			}

			/*
			 * Any vdev we find that does not have a case file
			 * must be in the healthy state and thus worthy of
			 * continued SERD data tracking.
			 */
			caseFile = new CaseFile(Vdev(zpl.front(), vdevConf));
		}

		ifstream caseStream(fullName.c_str());
		if (!caseStream)
			throw ZfsdException("CaseFile::DeSerialize: Unable to "
					    "read %s.\n", fileName);

		caseFile->DeSerialize(caseStream);
	} catch (const ParseException &exp) {

		exp.Log();
		if (caseFile != existingCaseFile)
			delete caseFile;

		/*
		 * Since we can't parse the file, unlink it so we don't
		 * trip over it again.  Use the full path here too; the
		 * bare fileName would resolve against the daemon's CWD.
		 */
		unlink(fullName.c_str());
	} catch (const ZfsdException &zfsException) {

		zfsException.Log();
		if (caseFile != existingCaseFile)
			delete caseFile;
	}
}

//- CaseFile Protected Methods -------------------------------------------------
CaseFile::CaseFile(const Vdev &vdev)
 : m_poolGUID(vdev.PoolGUID()),
   m_vdevGUID(vdev.GUID()),
   m_vdevState(vdev.State()),
   m_vdevPhysPath(vdev.PhysicalPath()),
   m_is_spare(vdev.IsSpare())
{
	stringstream vdevGUIDStream;
	stringstream poolGUIDStream;

	/* Cache printable forms of the vdev and pool GUIDs. */
	vdevGUIDStream << m_vdevGUID;
	m_vdevGUIDString = vdevGUIDStream.str();
	poolGUIDStream << m_poolGUID;
	m_poolGUIDString = poolGUIDStream.str();

	/*
	 * Resolve a human-readable vdev name, passing a NULL pool handle
	 * when the pool is not currently imported.
	 */
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *poolHandle(zpl.empty() ? NULL : zpl.front());
	m_vdevName = vdev.Name(poolHandle, false);

	/* Register this case so event dispatch can find it. */
	s_activeCases.push_back(this);

	syslog(LOG_INFO, "Creating new CaseFile:\n");
	Log();
}

CaseFile::~CaseFile()
{
	/*
	 * Free all events owned by this case (both confirmed and
	 * tentative), cancel any pending grace-period timer, and drop
	 * ourselves from the list of active cases.
	 */
	PurgeEvents();
	PurgeTentativeEvents();
	m_tentativeTimer.Stop();
	s_activeCases.remove(this);
}

void
CaseFile::PurgeEvents()
{
	/* This case owns its events; destroy each one as it is dequeued. */
	while (!m_events.empty()) {
		delete m_events.front();
		m_events.pop_front();
	}
}

void
CaseFile::PurgeTentativeEvents()
{
	/* Destroy each tentative event as it is dequeued; we own them. */
	while (!m_tentativeEvents.empty()) {
		delete m_tentativeEvents.front();
		m_tentativeEvents.pop_front();
	}
}

void
CaseFile::SerializeEvList(const EventList events, int fd,
                const char* prefix) const
{
        if (events.empty())
                return;
        for (EventList::const_iterator curEvent = events.begin();
             curEvent != events.end(); curEvent++) {
                const string &eventString((*curEvent)->GetEventString());

                // TODO: replace many write(2) calls with a single writev(2)
                if (prefix)
                        write(fd, prefix, strlen(prefix));
                write(fd, eventString.c_str(), eventString.length());
        }
}

void
CaseFile::Serialize()
{
        stringstream saveFile;

        saveFile << setfill('0')
                 << s_caseFilePath << "/"
                 << "pool_" << PoolGUIDString()
                 << "_vdev_" << VdevGUIDString()
                 << ".case";

        if (m_events.empty() && m_tentativeEvents.empty()) {
                unlink(saveFile.str().c_str());
                return;
        }

        int fd(open(saveFile.str().c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644));
        if (fd == -1) {
                syslog(LOG_ERR, "CaseFile::Serialize: Unable to open %s.\n",
                       saveFile.str().c_str());
                return;
        }
        SerializeEvList(m_events, fd);
        SerializeEvList(m_tentativeEvents, fd, "tentative ");
        close(fd);
}

/*
 * XXX: This method assumes that events may not contain embedded newlines.  If
 * ever events can contain embedded newlines, then CaseFile must switch
 * serialization formats
 */
void
CaseFile::DeSerialize(ifstream &caseStream)
{
	const EventFactory &factory(ZfsDaemon::Get().GetFactory());

	/*
	 * Disable whitespace skipping and discard any leading whitespace
	 * so that an empty file leaves the stream in a !good() state and
	 * the loop below is never entered.
	 */
	caseStream >> std::noskipws >> std::ws;
	while (caseStream.good()) {
		/*
		 * Outline:
		 * read the beginning of a line and check it for
		 * "tentative".  If found, discard "tentative".
		 * Create a new event
		 * continue
		 */
		EventList* destEvents;
		const string tentFlag("tentative ");
		string line;
		std::stringbuf lineBuf;

		/*
		 * NOTE(review): istream::get(streambuf&) sets failbit when
		 * it extracts zero characters, so a blank line would end
		 * deserialization and drop any remaining events — confirm
		 * serialized cases never contain blank lines.
		 */
		caseStream.get(lineBuf);
		caseStream.ignore();  /*discard the newline character*/
		line = lineBuf.str();
		if (line.compare(0, tentFlag.size(), tentFlag) == 0) {
			/* Discard "tentative" */
			line.erase(0, tentFlag.size());
			destEvents = &m_tentativeEvents;
		} else {
			destEvents = &m_events;
		}
		/*
		 * Rehydrate the event and re-register its timeout callout
		 * so grace periods continue across daemon restarts.
		 */
		Event *event(Event::CreateEvent(factory, line));
		if (event != NULL) {
			destEvents->push_back(event);
			RegisterCallout(*event);
		}
	}
}

void
CaseFile::Close()
{
	/*
	 * The case has been resolved.  Log its final disposition, remove
	 * its on-disk state, and destroy the object.
	 */
	syslog(LOG_INFO, "CaseFile(%s,%s) closed - State %s\n",
	       PoolGUIDString().c_str(), VdevGUIDString().c_str(),
	       zpool_state_to_name(VdevState(), VDEV_AUX_NONE));

	/*
	 * Serializing a case that holds no events deletes its
	 * serialization file, so purge first and then serialize.
	 */
	PurgeEvents();
	Serialize();

	delete this;
}

void
CaseFile::OnGracePeriodEnded()
{
	bool should_fault, should_degrade;
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());

	/*
	 * Promote all tentative events to full case members, then
	 * evaluate the fault/degrade thresholds exactly once.  The
	 * cached results are reused below; re-invoking ShouldFault()/
	 * ShouldDegrade() would redo the vdev property lookups (each
	 * opens the pool list again) for identical answers, since
	 * m_events does not change in between.
	 */
	m_events.splice(m_events.begin(), m_tentativeEvents);
	should_fault = ShouldFault();
	should_degrade = ShouldDegrade();

	if (should_fault || should_degrade) {
		if (zhp == NULL
		 || (VdevIterator(zhp).Find(m_vdevGUID)) == NULL) {
			/*
			 * Either the pool no longer exists
			 * or this vdev is no longer a member of
			 * the pool.
			 */
			Close();
			return;
		}

	}

	/* A fault condition has priority over a degrade condition */
	if (should_fault) {
		/* Fault the vdev and close the case. */
		if (zpool_vdev_fault(zhp, (uint64_t)m_vdevGUID,
				       VDEV_AUX_ERR_EXCEEDED) == 0) {
			syslog(LOG_INFO, "Faulting vdev(%s/%s)",
			       PoolGUIDString().c_str(),
			       VdevGUIDString().c_str());
			Close();
			return;
		}
		else {
			syslog(LOG_ERR, "Fault vdev(%s/%s): %s: %s\n",
			       PoolGUIDString().c_str(),
			       VdevGUIDString().c_str(),
			       libzfs_error_action(g_zfsHandle),
			       libzfs_error_description(g_zfsHandle));
		}
	}
	else if (should_degrade) {
		/* Degrade the vdev and close the case. */
		if (zpool_vdev_degrade(zhp, (uint64_t)m_vdevGUID,
				       VDEV_AUX_ERR_EXCEEDED) == 0) {
			syslog(LOG_INFO, "Degrading vdev(%s/%s)",
			       PoolGUIDString().c_str(),
			       VdevGUIDString().c_str());
			Close();
			return;
		}
		else {
			syslog(LOG_ERR, "Degrade vdev(%s/%s): %s: %s\n",
			       PoolGUIDString().c_str(),
			       VdevGUIDString().c_str(),
			       libzfs_error_action(g_zfsHandle),
			       libzfs_error_description(g_zfsHandle));
		}
	}
	/* If the vdev could not be faulted/degraded, persist the events. */
	Serialize();
}

Vdev
CaseFile::BeingReplacedBy(zpool_handle_t *zhp) {
	Vdev caseVdev(zhp, CaseVdev(zhp));
	Vdev parent(caseVdev.Parent());
	Vdev replacing(NonexistentVdev);

	/*
	 * We are being replaced by a working spare only if our parent is
	 * a "spare" vdev and a sibling under it is currently spared in,
	 * i.e. resilvering or healthy.  Otherwise no replacement is in
	 * progress.
	 *
	 * If the spare is healthy, then the case file should be closed
	 * very soon after this check.
	 */
	if (parent.DoesNotExist()
	 || parent.Name(zhp, /*verbose*/false) != "spare")
		return (NonexistentVdev);

	std::list<Vdev> siblings(parent.Children());
	std::list<Vdev>::iterator it;
	for (it = siblings.begin(); it != siblings.end(); ++it) {
		Vdev &candidate(*it);

		/* Our own vdev can never be its own replacement. */
		if (candidate.GUID() == VdevGUID())
			continue;
		/*
		 * Take the first child that isn't us; thereafter prefer
		 * any later child that is resilvering or healthy.
		 */
		if (replacing.DoesNotExist() || candidate.IsResilvering()
		 || candidate.State() == VDEV_STATE_HEALTHY)
			replacing = candidate;
	}

	return (replacing);
}

/*
 * Attempt to replace this case's vdev (or the broken spare already
 * covering it) with the device at \a path.
 *
 * \param vdev_type  ZPOOL_CONFIG_TYPE for the new device (e.g. "disk").
 * \param path       Device node path of the replacement.
 * \param isspare    True if the replacement is a hot spare.
 *
 * \return  True if the attach was initiated (event consumed).
 */
bool
CaseFile::Replace(const char* vdev_type, const char* path, bool isspare) {
	nvlist_t *nvroot, *newvd;
	const char *poolname;
	string oldstr(VdevGUIDString());
	bool retval = true;

	/* Figure out what pool we're working on */
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &m_poolGUID);
	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());
	if (zhp == NULL) {
		syslog(LOG_ERR, "CaseFile::Replace: could not find pool for "
		       "pool_guid %" PRIu64 ".", (uint64_t)m_poolGUID);
		return (false);
	}
	poolname = zpool_get_name(zhp);
	Vdev vd(zhp, CaseVdev(zhp));
	Vdev replaced(BeingReplacedBy(zhp));

	if (isspare && !vd.IsSpare() && !replaced.DoesNotExist()) {
		/* If we are already being replaced by a working spare, pass. */
		if (replaced.IsResilvering()
		 || replaced.State() == VDEV_STATE_HEALTHY) {
			syslog(LOG_INFO, "CaseFile::Replace(%s->%s): already "
			    "replaced", VdevGUIDString().c_str(), path);
			return (/*consumed*/false);
		}
		/*
		 * If we have already been replaced by a spare, but that spare
		 * is broken, we must spare the spare, not the original device.
		 */
		oldstr = replaced.GUIDString();
		syslog(LOG_INFO, "CaseFile::Replace(%s->%s): sparing "
		    "broken spare %s instead", VdevGUIDString().c_str(),
		    path, oldstr.c_str());
	}

	/*
	 * Build a root vdev/leaf vdev configuration suitable for
	 * zpool_vdev_attach. Only enough data for the kernel to find
	 * the device (i.e. type and disk device node path) are needed.
	 */
	nvroot = NULL;
	newvd = NULL;

	if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0
	 || nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) {
		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to allocate "
		    "configuration data.", poolname, oldstr.c_str());
		if (nvroot != NULL)
			nvlist_free(nvroot);
		return (false);
	}
	if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, vdev_type) != 0
	 || nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0
	 || nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0
	 || nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
				    &newvd, 1) != 0) {
		syslog(LOG_ERR, "Replace vdev(%s/%s): Unable to initialize "
		    "configuration data.", poolname, oldstr.c_str());
		nvlist_free(newvd);
		nvlist_free(nvroot);
		/*
		 * No attach was attempted; report failure just as the
		 * allocation-failure path above does.  (This previously
		 * returned true despite initiating no replacement.)
		 */
		return (false);
	}

	/* Data was copied when added to the root vdev. */
	nvlist_free(newvd);

	retval = (zpool_vdev_attach(zhp, oldstr.c_str(), path, nvroot,
       /*replace*/B_TRUE, /*rebuild*/ B_FALSE) == 0);
	if (retval)
		syslog(LOG_INFO, "Replacing vdev(%s/%s) with %s\n",
		    poolname, oldstr.c_str(), path);
	else
		syslog(LOG_ERR, "Replace vdev(%s/%s): %s: %s\n",
		    poolname, oldstr.c_str(), libzfs_error_action(g_zfsHandle),
		    libzfs_error_description(g_zfsHandle));
	nvlist_free(nvroot);

	return (retval);
}

/*
 * Lookup the vdev prop. Used for checksum, IO, or slow IO props.
 *
 * \param vdev_prop  The vdev property to query.
 *
 * \return  The property's integer value, or -1 if the pool cannot be
 *          found, the query fails, or the property is unset ("-").
 */
int
CaseFile::GetVdevProp(vdev_prop_t vdev_prop) const
{
	char val[ZFS_MAXPROPLEN];
	zprop_source_t srctype;
	DevdCtl::Guid poolGUID = PoolGUID();
	ZpoolList zpl(ZpoolList::ZpoolByGUID, &poolGUID);
	zpool_handle_t *zhp(zpl.empty() ? NULL : zpl.front());

	/*
	 * zpool_get_vdev_prop() takes a non-const property-name pointer;
	 * use an explicit const_cast rather than a C-style cast.
	 */
	char *prop_str = const_cast<char *>(vdev_prop_to_name(vdev_prop));
	if (zhp == NULL || zpool_get_vdev_prop(zhp, m_vdevName.c_str(),
	    vdev_prop, prop_str, val, sizeof (val), &srctype, B_FALSE) != 0)
		return (-1);

	/* we'll get "-" from libzfs for a prop that is not set */
	if (zfs_isnumber(val) == B_FALSE)
		return (-1);

	return (atoi(val));
}

bool
CaseFile::ShouldDegrade() const
{
	/* Fall back to the compiled-in threshold when the prop is unset. */
	const int prop(GetVdevProp(VDEV_PROP_CHECKSUM_N));
	const int limit(prop == -1 ? DEFAULT_ZFS_DEGRADE_IO_COUNT : prop);

	return (std::count_if(m_events.begin(), m_events.end(),
			      IsChecksumEvent) > limit);
}

bool
CaseFile::ShouldFault() const
{
	/* Fall back to compiled-in thresholds when the props are unset. */
	const int ioProp(GetVdevProp(VDEV_PROP_IO_N));
	const int slowProp(GetVdevProp(VDEV_PROP_SLOW_IO_N));
	const int ioLimit(ioProp == -1 ?
	    DEFAULT_ZFS_DEGRADE_IO_COUNT : ioProp);
	const int slowLimit(slowProp == -1 ?
	    DEFAULT_ZFS_FAULT_SLOW_IO_COUNT : slowProp);

	/* Fault when either hard I/O errors or slow I/Os exceed a limit. */
	if (std::count_if(m_events.begin(), m_events.end(),
	    IsIOEvent) > ioLimit)
		return (true);
	return (std::count_if(m_events.begin(), m_events.end(),
	    IsDelayEvent) > slowLimit);
}

nvlist_t *
CaseFile::CaseVdev(zpool_handle_t *zhp) const
{
	/* Locate this case's vdev configuration within the given pool. */
	VdevIterator vdevs(zhp);

	return (vdevs.Find(VdevGUID()));
}