root/usr/src/cmd/fm/modules/common/disk-monitor/disk_monitor.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
 */

/*
 * Disk Monitor
 */
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <strings.h>
#include <stdarg.h>
#include <errno.h>
#include <signal.h>
#include <unistd.h>
#include <pthread.h>
#include <libnvpair.h>
#include <fm/fmd_api.h>
#include <fm/fmd_fmri.h>
#include <sys/fm/protocol.h>
#include <sys/fm/io/disk.h>
#include <fm/libtopo.h>

#include "disk_monitor.h"
#include "hotplug_mgr.h"
#include "schg_mgr.h"
#include "topo_gather.h"
#include "dm_platform.h"

#define THIS_FMD_MODULE_NAME "disk-monitor"

/*
 * Bit flags recording which manager subsystems diskmon_init() has brought
 * up so far.  diskmon_init() ORs a flag into g_init_state after the
 * corresponding init call succeeds and tests the flags on its failure path
 * to decide which cleanup routines to run.
 */
static enum disk_init_state {
        INIT_STATE_NONE = 0,
        STATE_CHANGE_MGR_INITTED = 2,
        HOTPLUG_MGR_INITTED = 4
} g_init_state = INIT_STATE_NONE;

/*
 * Kind of fault list, judging by the enumerator names: suspect or repaired.
 * NOTE(review): nothing in this file references this type -- confirm it is
 * still needed.
 */
typedef enum {
        LT_SUSPECT,
        LT_REPAIRED
} fm_list_type_t;

/*
 * Global verbosity flag -- controls chattiness of debug messages and
 * warnings.  Its value is determined by the fmd property "log-level"
 * settable in the DE's .conf file.
 */
log_class_t                     g_verbose = 0;

/*
 * Module configuration.  _fmd_init() counts the disks on its disk_list and
 * the state change manager is initialized and cleaned up with it.
 */
cfgdata_t                       *config_data = NULL;

/*
 * The fmd handle for this module: stored by _fmd_init() and reset to NULL
 * by _fmd_fini().
 */
fmd_hdl_t                       *g_fm_hdl = NULL;

/*
 * Release everything that a successful initialization acquired.
 *
 * The two managers are stopped first, then the platform layer is shut
 * down, and the configuration is released last.  The manager/platform
 * ordering is the same one diskmon_init() uses on its failure path;
 * config_fini() runs last because cleanup_state_change_manager() is still
 * handed config_data.
 */
static void
diskmon_teardown_all(void)
{
        cleanup_hotplug_manager();
        cleanup_state_change_manager(config_data);

        /*
         * diskmon_init() calls dm_platform_init(), so the platform layer
         * needs its matching dm_platform_fini() on unload as well, not
         * only when initialization fails part-way.
         */
        dm_platform_fini();

        config_fini();
}

/*
 * Return the number of entries on the singly linked list of diskmon
 * structures that starts at disklistp.  An empty (NULL) list yields 0.
 */
static int
count_disks(diskmon_t *disklistp)
{
        diskmon_t       *cur;
        int             ndisks = 0;

        for (cur = disklistp; cur != NULL; cur = cur->next)
                ndisks++;

        return (ndisks);
}

/*
 * Bring up the platform layer, the hotplug manager and the state change
 * manager, in that order.
 *
 * Returns E_SUCCESS when all three initialize.  On the first failure the
 * managers recorded in g_init_state are cleaned up, the platform layer is
 * finalized and E_ERROR is returned.
 */
static int
diskmon_init(void)
{
        /*
         * State change events are produced by the hotplug manager thread.
         * Hold them back until the state change manager thread is able to
         * accept them; they are unblocked shortly after that thread starts.
         */
        block_state_change_events();

        if (dm_platform_init() != 0)
                goto fail;

        if (init_hotplug_manager() != 0)
                goto fail;
        g_init_state |= HOTPLUG_MGR_INITTED;

        if (init_state_change_manager(config_data) != 0)
                goto fail;
        g_init_state |= STATE_CHANGE_MGR_INITTED;

        return (E_SUCCESS);

fail:
        unblock_state_change_events();

        /*
         * The managers depend on one another, so the order of the cleanup
         * calls below is significant.
         */
        if ((g_init_state & HOTPLUG_MGR_INITTED) != 0)
                cleanup_hotplug_manager();
        if ((g_init_state & STATE_CHANGE_MGR_INITTED) != 0)
                cleanup_state_change_manager(config_data);
        dm_platform_fini();

        return (E_ERROR);
}

/*
 * Carry out the configured response to the fault nvl reported against
 * diskp.
 *
 * The fault indicator is always switched on.  In addition, if the fault
 * class is the over-temperature, self-test-failure or SSM-wearout class,
 * the disk property named for that class is looked up and, when present,
 * its value is passed to dm_platform_indicator_execute().  A failing
 * action is only logged as a warning.
 */
static void
dm_fault_execute_actions(fmd_hdl_t *hdl, diskmon_t *diskp, nvlist_t *nvl)
{
        const char      *prop_name = NULL;
        const char      *action;

        /*
         * Pick the property that names the action for this fault class.
         * All three classes are tested; a later match replaces an earlier
         * one.
         */
        if (fmd_nvl_class_match(hdl, nvl,
            DISK_ERROR_CLASS "." FM_FAULT_DISK_OVERTEMP))
                prop_name = DISK_PROP_OTEMPACTION;

        if (fmd_nvl_class_match(hdl, nvl,
            DISK_ERROR_CLASS "." FM_FAULT_DISK_TESTFAIL))
                prop_name = DISK_PROP_STFAILACTION;

        if (fmd_nvl_class_match(hdl, nvl,
            DISK_ERROR_CLASS "." FM_FAULT_SSM_WEAROUT))
                prop_name = DISK_PROP_SSMWEAROUTACTION;

        /* Activating the fault indicator is done for every fault. */
        dm_fault_indicator_set(diskp, INDICATOR_ON);

        if (prop_name == NULL)
                return;

        action = dm_prop_lookup(diskp->props, prop_name);
        if (action == NULL)
                return;

        if (dm_platform_indicator_execute(action) != 0) {
                log_warn("Fault action `%s' did not successfully "
                    "complete.\n", action);
        }
}

/*
 * Handle a list event that reports repairs.
 *
 * For every fault in the event's fault list whose resource FMRI maps to a
 * monitored disk, the fault indicator is turned off and an HPS_REPAIRED
 * state change is issued.  Faults with no resource, or with a resource
 * that is not a monitored disk, are skipped.  When repair is non-zero the
 * case identified by the event's UUID is then marked resolved.
 *
 * Nothing is done if the event carries no UUID or no fault list.
 */
static void
diskmon_agent_repair(fmd_hdl_t *hdl, nvlist_t *nvl, int repair)
{
        char            *uuid = NULL;
        nvlist_t        **faults;
        uint_t          nfaults;
        uint_t          i;
        int             uuid_err;
        int             list_err;

        uuid_err = nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
        list_err = nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
            &faults, &nfaults);
        if (uuid_err != 0 || list_err != 0)
                return;

        for (i = 0; i < nfaults; i++) {
                nvlist_t        *resource;
                diskmon_t       *diskp;

                if (nvlist_lookup_nvlist(faults[i], FM_FAULT_RESOURCE,
                    &resource) != 0)
                        continue;

                diskp = dm_fmri_to_diskmon(hdl, resource);
                if (diskp == NULL)
                        continue;

                log_msg(MM_MAIN, "Disk %s repaired!\n",
                    diskp->location);

                dm_fault_indicator_set(diskp, INDICATOR_OFF);

                dm_state_change(diskp, HPS_REPAIRED);
        }

        if (repair != 0)
                fmd_case_uuresolved(hdl, uuid);
}

/*
 * Handle a suspect list event.
 *
 * The fault list is walked for as long as the case identified by the
 * event's UUID has not been closed.  For every fault whose resource FMRI
 * maps to a monitored disk, the fault actions are executed and an
 * HPS_FAULTED state change is issued.  Afterwards the case is closed if it
 * is not closed already.
 *
 * Nothing is done if the event carries no UUID or no fault list.
 */
static void
diskmon_agent_suspect(fmd_hdl_t *hdl, nvlist_t *nvl)
{
        char            *uuid = NULL;
        nvlist_t        **faults;
        uint_t          nfaults;
        uint_t          i;
        int             uuid_err;
        int             list_err;

        uuid_err = nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid);
        list_err = nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST,
            &faults, &nfaults);
        if (uuid_err != 0 || list_err != 0)
                return;

        for (i = 0; i < nfaults && !fmd_case_uuclosed(hdl, uuid); i++) {
                nvlist_t        *fault = faults[i];
                nvlist_t        *resource;
                diskmon_t       *diskp;

                if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE,
                    &resource) != 0)
                        continue;

                diskp = dm_fmri_to_diskmon(hdl, resource);
                if (diskp == NULL)
                        continue;

                /* Run whatever actions are configured for this fault. */
                dm_fault_execute_actions(hdl, diskp, fault);

                /* Tell the state change manager the disk is faulted. */
                dm_state_change(diskp, HPS_FAULTED);
        }

        /* The suspects have been acted upon; close the case if still open. */
        if (!fmd_case_uuclosed(hdl, uuid))
                fmd_case_uuclose(hdl, uuid);
}

/*
 * Event receive callback (installed as the fmdo_recv slot of fmd_ops).
 *
 * List events are dispatched by class: repaired and updated lists go to
 * diskmon_agent_repair() (resolving the case only for repaired lists),
 * suspect lists go to diskmon_agent_suspect(), and resolved lists are
 * ignored.  Any other event whose class falls under DISK_ERROR_CLASS is
 * treated as a fault against the disk named by its resource FMRI.
 *
 * When the MM_MAIN bit is set in g_verbose the event is first dumped to
 * stderr.
 */
/*ARGSUSED*/
static void
diskmon_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class)
{
        diskmon_t       *diskp;
        nvlist_t        *resource;

        if ((g_verbose & MM_MAIN) != 0)
                nvlist_print(stderr, nvl);

        /*
         * Embedded agent action: act on the suspect, repaired, updated
         * and resolved list events.
         */
        if (fmd_nvl_class_match(hdl, nvl, FM_LIST_REPAIRED_CLASS)) {
                diskmon_agent_repair(hdl, nvl, 1);
                return;
        }

        if (fmd_nvl_class_match(hdl, nvl, FM_LIST_UPDATED_CLASS)) {
                diskmon_agent_repair(hdl, nvl, 0);
                return;
        }

        if (fmd_nvl_class_match(hdl, nvl, FM_LIST_SUSPECT_CLASS)) {
                diskmon_agent_suspect(hdl, nvl);
                return;
        }

        if (fmd_nvl_class_match(hdl, nvl, FM_LIST_RESOLVED_CLASS))
                return;

        /*
         * What remains of interest are (replayed) disk faults: run the
         * fault actions for the disk and move it to the faulted state.
         */
        if (!fmd_nvl_class_match(hdl, nvl, DISK_ERROR_CLASS ".*"))
                return;

        if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &resource) != 0)
                return;

        diskp = dm_fmri_to_diskmon(hdl, resource);
        if (diskp == NULL)
                return;

        /* Run whatever actions are configured for this fault. */
        dm_fault_execute_actions(hdl, diskp, nvl);

        /* Tell the state change manager the disk is faulted. */
        dm_state_change(diskp, HPS_FAULTED);
}

/*
 * Entry points this module hands to fmd (referenced from fmd_info).  Only
 * the event receive callback is provided; the other slots listed here are
 * left NULL.
 */
static const fmd_hdl_ops_t fmd_ops = {
        diskmon_recv,   /* fmdo_recv */
        NULL,           /* fmdo_timeout */
        NULL,           /* fmdo_close */
        NULL,           /* fmdo_stats */
        NULL,           /* fmdo_gc */
};

/*
 * Properties declared by this module: a single FMD_TYPE_UINT32 property
 * named by GLOBAL_PROP_LOG_LEVEL with a default value of "0", followed by
 * the all-NULL entry that ends the table.  The table is referenced from
 * fmd_info and is also passed to config_get() by _fmd_init().
 */
static const fmd_prop_t fmd_props[] = {
        { GLOBAL_PROP_LOG_LEVEL, FMD_TYPE_UINT32, "0" },
        { NULL, 0, NULL }
};

/*
 * Module description passed to fmd_hdl_register() by _fmd_init(): the
 * module's descriptive name, its version, the ops vector and the property
 * table.
 */
static const fmd_hdl_info_t fmd_info = {
        "Disk Monitor",
        DISK_MONITOR_MODULE_VERSION,
        &fmd_ops,
        fmd_props
};

/*
 * Module load entry point.
 *
 * Registers the module with fmd, loads the configuration, and starts the
 * monitoring subsystems through diskmon_init().  If the configuration
 * cannot be set up, if there are no disks to monitor, or if diskmon_init()
 * fails, whatever configuration was created is released and the module
 * unregisters itself again.  On success every case that is still unsolved
 * is solved.
 */
void
_fmd_init(fmd_hdl_t *hdl)
{
        fmd_case_t      *cp;
        int             ndisks;

        g_fm_hdl = hdl;

        if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0)
                return;

        if (config_init() != 0) {
                log_err("Could not initialize configuration!\n");
                goto unregister;
        }

        if (config_get(hdl, fmd_props) != 0) {
                config_fini();
                log_err("Could not retrieve configuration from libtopo!\n");
                goto unregister;
        }

        /* With no disks to monitor there is nothing for us to do. */
        ndisks = count_disks(config_data->disk_list);
        if (ndisks == 0)
                goto release_config;

        if (diskmon_init() == E_ERROR)
                goto release_config;

        log_msg(MM_MAIN, "Monitoring %d disks.\n", ndisks);

        /*
         * Walk every active case.  This module solves all of its cases
         * automatically, so a case that is found unsolved here must have
         * had its fault added just before the DE was interrupted; finish
         * the job now.
         */
        cp = fmd_case_next(hdl, NULL);
        while (cp != NULL) {
                if (!fmd_case_solved(hdl, cp))
                        fmd_case_solve(hdl, cp);
                cp = fmd_case_next(hdl, cp);
        }

        return;

release_config:
        config_fini();
unregister:
        fmd_hdl_unregister(hdl);
}

/*
 * Module unload entry point: release the module's resources through
 * diskmon_teardown_all() and then forget the cached fmd handle.  The
 * handle is cleared only after the teardown has run.
 */
/*ARGSUSED*/
void
_fmd_fini(fmd_hdl_t *hdl)
{
        diskmon_teardown_all();
        g_fm_hdl = NULL;
}