root/usr/src/cmd/svc/startd/restarter.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2017 RackTop Systems.
 * Copyright 2019 Joyent, Inc.
 */

/*
 * restarter.c - service manipulation
 *
 * This component manages services whose restarter is svc.startd, the standard
 * restarter.  It translates restarter protocol events from the graph engine
 * into actions on processes, as a delegated restarter would do.
 *
 * The master restarter manages a number of always-running threads:
 *   - restarter event thread: events from the graph engine
 *   - timeout thread: thread to fire queued timeouts
 *   - contract thread: thread to handle contract events
 *   - wait thread: thread to handle wait-based services
 *
 * The other threads are created as-needed:
 *   - per-instance method threads
 *   - per-instance event processing threads
 *
 * The interaction of all threads must result in the following conditions
 * being satisfied (on a per-instance basis):
 *   - restarter events must be processed in order
 *   - method execution must be serialized
 *   - instance delete must be held until outstanding methods are complete
 *   - contract events shouldn't be processed while a method is running
 *   - timeouts should fire even when a method is running
 *
 * Service instances are represented by restarter_inst_t's and are kept in the
 * instance_list list.
 *
 * Service States
 *   The current state of a service instance is kept in
 *   restarter_inst_t->ri_i.i_state.  If transition to a new state could take
 *   some time, then before we effect the transition we set
 *   restarter_inst_t->ri_i.i_next_state to the target state, and afterwards we
 *   rotate i_next_state to i_state and set i_next_state to
 *   RESTARTER_STATE_NONE.  So usually i_next_state is _NONE when ri_lock is not
 *   held.  The exception is when we launch methods, which are done with
 *   a separate thread.  To keep any other threads from grabbing ri_lock before
 *   method_thread() does, we set ri_method_thread to the thread id of the
 *   method thread, and when it is nonzero any thread with a different thread id
 *   waits on ri_method_cv.
 *
 * Method execution is serialized by blocking on ri_method_cv in
 * inst_lookup_by_id() and waiting for a 0 value of ri_method_thread.  This
 * also prevents the instance structure from being deleted until all
 * outstanding operations such as method_thread() have finished.
 *
 * Lock ordering:
 *
 * dgraph_lock [can be held when taking:]
 *   utmpx_lock
 *   dictionary->dict_lock
 *   st->st_load_lock
 *   wait_info_lock
 *   ru->restarter_update_lock
 *     restarter_queue->rpeq_lock
 *   instance_list.ril_lock
 *     inst->ri_lock
 *   st->st_configd_live_lock
 *
 * instance_list.ril_lock
 *   graph_queue->gpeq_lock
 *   gu->gu_lock
 *   st->st_configd_live_lock
 *   dictionary->dict_lock
 *   inst->ri_lock
 *     graph_queue->gpeq_lock
 *     gu->gu_lock
 *     tu->tu_lock
 *     tq->tq_lock
 *     inst->ri_queue_lock
 *       wait_info_lock
 *       bp->cb_lock
 *     utmpx_lock
 *
 * single_user_thread_lock
 *   wait_info_lock
 *   utmpx_lock
 *
 * gu_freeze_lock
 *
 * logbuf_mutex nests inside pretty much everything.
 */

#include <sys/contract/process.h>
#include <sys/ctfs.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/uio.h>
#include <sys/wait.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <libcontract.h>
#include <libcontract_priv.h>
#include <libintl.h>
#include <librestart.h>
#include <librestart_priv.h>
#include <libuutil.h>
#include <limits.h>
#include <poll.h>
#include <port.h>
#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>
#include <strings.h>
#include <unistd.h>

#include "startd.h"
#include "protocol.h"

uu_list_pool_t *contract_list_pool;
static uu_list_pool_t *restarter_instance_pool;
static restarter_instance_list_t instance_list;

static uu_list_pool_t *restarter_queue_pool;

#define WT_SVC_ERR_THROTTLE     1       /* 1 sec delay for erroring wait svc */

/*
 * Function used to reset the restart times for an instance, when
 * an administrative task comes along and essentially makes the times
 * in this array ineffective.
 */
static void
reset_start_times(restarter_inst_t *inst)
{
        inst->ri_start_index = 0;
        bzero(inst->ri_start_time, sizeof (inst->ri_start_time));
}

/*ARGSUSED*/
static int
restarter_instance_compare(const void *lc_arg, const void *rc_arg,
    void *private)
{
        int lc_id = ((const restarter_inst_t *)lc_arg)->ri_id;
        int rc_id = *(int *)rc_arg;

        if (lc_id > rc_id)
                return (1);
        if (lc_id < rc_id)
                return (-1);
        return (0);
}

static restarter_inst_t *
inst_lookup_by_name(const char *name)
{
        int id;

        id = dict_lookup_byname(name);
        if (id == -1)
                return (NULL);

        return (inst_lookup_by_id(id));
}

restarter_inst_t *
inst_lookup_by_id(int id)
{
        restarter_inst_t *inst;

        MUTEX_LOCK(&instance_list.ril_lock);
        inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
        if (inst != NULL)
                MUTEX_LOCK(&inst->ri_lock);
        MUTEX_UNLOCK(&instance_list.ril_lock);

        if (inst != NULL) {
                while (inst->ri_method_thread != 0 &&
                    !pthread_equal(inst->ri_method_thread, pthread_self())) {
                        ++inst->ri_method_waiters;
                        (void) pthread_cond_wait(&inst->ri_method_cv,
                            &inst->ri_lock);
                        assert(inst->ri_method_waiters > 0);
                        --inst->ri_method_waiters;
                }
        }

        return (inst);
}

static restarter_inst_t *
inst_lookup_queue(const char *name)
{
        int id;
        restarter_inst_t *inst;

        id = dict_lookup_byname(name);
        if (id == -1)
                return (NULL);

        MUTEX_LOCK(&instance_list.ril_lock);
        inst = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
        if (inst != NULL)
                MUTEX_LOCK(&inst->ri_queue_lock);
        MUTEX_UNLOCK(&instance_list.ril_lock);

        return (inst);
}

const char *
service_style(int flags)
{
        switch (flags & RINST_STYLE_MASK) {
        case RINST_CONTRACT:    return ("contract");
        case RINST_TRANSIENT:   return ("transient");
        case RINST_WAIT:        return ("wait");

        default:
#ifndef NDEBUG
                uu_warn("%s:%d: Bad flags 0x%x.\n", __FILE__, __LINE__, flags);
#endif
                abort();
                /* NOTREACHED */
        }
}

/*
 * Fails with ECONNABORTED or ECANCELED.
 */
static int
check_contract(restarter_inst_t *inst, boolean_t primary,
    scf_instance_t *scf_inst)
{
        ctid_t *ctidp;
        int fd, r;

        ctidp = primary ? &inst->ri_i.i_primary_ctid :
            &inst->ri_i.i_transient_ctid;

        assert(*ctidp >= 1);

        fd = contract_open(*ctidp, NULL, "status", O_RDONLY);
        if (fd >= 0) {
                r = close(fd);
                assert(r == 0);
                return (0);
        }

        r = restarter_remove_contract(scf_inst, *ctidp, primary ?
            RESTARTER_CONTRACT_PRIMARY : RESTARTER_CONTRACT_TRANSIENT);
        switch (r) {
        case 0:
        case ECONNABORTED:
        case ECANCELED:
                *ctidp = 0;
                return (r);

        case ENOMEM:
                uu_die("Out of memory\n");
                /* NOTREACHED */

        case EPERM:
                uu_die("Insufficient privilege.\n");
                /* NOTREACHED */

        case EACCES:
                uu_die("Repository backend access denied.\n");
                /* NOTREACHED */

        case EROFS:
                log_error(LOG_INFO, "Could not remove unusable contract id %ld "
                    "for %s from repository.\n", *ctidp, inst->ri_i.i_fmri);
                return (0);

        case EINVAL:
        case EBADF:
        default:
                assert(0);
                abort();
                /* NOTREACHED */
        }
}

static int stop_instance(scf_handle_t *, restarter_inst_t *, stop_cause_t);

/*
 * int restarter_insert_inst(scf_handle_t *, char *)
 *   If the inst is already in the restarter list, return its id.  If the inst
 *   is not in the restarter list, initialize a restarter_inst_t, initialize its
 *   states, insert it into the list, and return 0.
 *
 *   Fails with
 *     ENOENT - name is not in the repository
 */
static int
restarter_insert_inst(scf_handle_t *h, const char *name)
{
        int id, r;
        restarter_inst_t *inst;
        uu_list_index_t idx;
        scf_service_t *scf_svc;
        scf_instance_t *scf_inst;
        scf_snapshot_t *snap = NULL;
        scf_propertygroup_t *pg;
        char *svc_name, *inst_name;
        char logfilebuf[PATH_MAX];
        char *c;
        boolean_t do_commit_states;
        restarter_instance_state_t state, next_state;
        protocol_states_t *ps;
        pid_t start_pid;
        restarter_str_t reason = restarter_str_insert_in_graph;

        MUTEX_LOCK(&instance_list.ril_lock);

        /*
         * We don't use inst_lookup_by_name() here because we want the lookup
         * & insert to be atomic.
         */
        id = dict_lookup_byname(name);
        if (id != -1) {
                inst = uu_list_find(instance_list.ril_instance_list, &id, NULL,
                    &idx);
                if (inst != NULL) {
                        MUTEX_UNLOCK(&instance_list.ril_lock);
                        return (0);
                }
        }

        /* Allocate an instance */
        inst = startd_zalloc(sizeof (restarter_inst_t));
        inst->ri_utmpx_prefix = startd_alloc(max_scf_value_size);
        inst->ri_utmpx_prefix[0] = '\0';

        inst->ri_i.i_fmri = startd_alloc(strlen(name) + 1);
        (void) strcpy((char *)inst->ri_i.i_fmri, name);

        inst->ri_queue = startd_list_create(restarter_queue_pool, inst, 0);

        /*
         * id shouldn't be -1 since we use the same dictionary as graph.c, but
         * just in case.
         */
        inst->ri_id = (id != -1 ? id : dict_insert(name));

        special_online_hooks_get(name, &inst->ri_pre_online_hook,
            &inst->ri_post_online_hook, &inst->ri_post_offline_hook);

        scf_svc = safe_scf_service_create(h);
        scf_inst = safe_scf_instance_create(h);
        pg = safe_scf_pg_create(h);
        svc_name = startd_alloc(max_scf_name_size);
        inst_name = startd_alloc(max_scf_name_size);

rep_retry:
        if (snap != NULL)
                scf_snapshot_destroy(snap);
        if (inst->ri_logstem != NULL)
                startd_free(inst->ri_logstem, PATH_MAX);
        if (inst->ri_common_name != NULL)
                free(inst->ri_common_name);
        if (inst->ri_C_common_name != NULL)
                free(inst->ri_C_common_name);
        snap = NULL;
        inst->ri_logstem = NULL;
        inst->ri_common_name = NULL;
        inst->ri_C_common_name = NULL;

        if (scf_handle_decode_fmri(h, name, NULL, scf_svc, scf_inst, NULL,
            NULL, SCF_DECODE_FMRI_EXACT) != 0) {
                switch (scf_error()) {
                case SCF_ERROR_CONNECTION_BROKEN:
                        libscf_handle_rebind(h);
                        goto rep_retry;

                case SCF_ERROR_NOT_FOUND:
                        goto deleted;
                }

                uu_die("Can't decode FMRI %s: %s\n", name,
                    scf_strerror(scf_error()));
        }

        /*
         * If there's no running snapshot, then we execute using the editing
         * snapshot.  Pending snapshots will be taken later.
         */
        snap = libscf_get_running_snapshot(scf_inst);

        if ((scf_service_get_name(scf_svc, svc_name, max_scf_name_size) < 0) ||
            (scf_instance_get_name(scf_inst, inst_name, max_scf_name_size) <
            0)) {
                switch (scf_error()) {
                case SCF_ERROR_NOT_SET:
                        break;

                case SCF_ERROR_CONNECTION_BROKEN:
                        libscf_handle_rebind(h);
                        goto rep_retry;

                default:
                        assert(0);
                        abort();
                }

                goto deleted;
        }

        (void) snprintf(logfilebuf, PATH_MAX, "%s:%s", svc_name, inst_name);
        for (c = logfilebuf; *c != '\0'; c++)
                if (*c == '/')
                        *c = '-';

        inst->ri_logstem = startd_alloc(PATH_MAX);
        (void) snprintf(inst->ri_logstem, PATH_MAX, "%s%s", logfilebuf,
            LOG_SUFFIX);

        /*
         * If the restarter group is missing, use uninit/none.  Otherwise,
         * we're probably being restarted & don't want to mess up the states
         * that are there.
         */
        state = RESTARTER_STATE_UNINIT;
        next_state = RESTARTER_STATE_NONE;

        r = scf_instance_get_pg(scf_inst, SCF_PG_RESTARTER, pg);
        if (r != 0) {
                switch (scf_error()) {
                case SCF_ERROR_CONNECTION_BROKEN:
                        libscf_handle_rebind(h);
                        goto rep_retry;

                case SCF_ERROR_NOT_SET:
                        goto deleted;

                case SCF_ERROR_NOT_FOUND:
                        /*
                         * This shouldn't happen since the graph engine should
                         * have initialized the state to uninitialized/none if
                         * there was no restarter pg.  In case somebody
                         * deleted it, though....
                         */
                        do_commit_states = B_TRUE;
                        break;

                default:
                        assert(0);
                        abort();
                }
        } else {
                r = libscf_read_states(pg, &state, &next_state);
                if (r != 0) {
                        do_commit_states = B_TRUE;
                } else {
                        if (next_state != RESTARTER_STATE_NONE) {
                                /*
                                 * Force next_state to _NONE since we
                                 * don't look for method processes.
                                 */
                                next_state = RESTARTER_STATE_NONE;
                                do_commit_states = B_TRUE;
                        } else {
                                /*
                                 * The reason for transition will depend on
                                 * state.
                                 */
                                if (st->st_initial == 0)
                                        reason = restarter_str_startd_restart;
                                else if (state == RESTARTER_STATE_MAINT)
                                        reason = restarter_str_bad_repo_state;
                                /*
                                 * Inform the restarter of our state without
                                 * changing the STIME in the repository.
                                 */
                                ps = startd_alloc(sizeof (*ps));
                                inst->ri_i.i_state = ps->ps_state = state;
                                inst->ri_i.i_next_state = ps->ps_state_next =
                                    next_state;
                                ps->ps_reason = reason;

                                graph_protocol_send_event(inst->ri_i.i_fmri,
                                    GRAPH_UPDATE_STATE_CHANGE, ps);

                                do_commit_states = B_FALSE;
                        }
                }
        }

        switch (libscf_get_startd_properties(scf_inst, snap, &inst->ri_flags,
            &inst->ri_utmpx_prefix)) {
        case 0:
                break;

        case ECONNABORTED:
                libscf_handle_rebind(h);
                goto rep_retry;

        case ECANCELED:
                goto deleted;

        case ENOENT:
                /*
                 * This is odd, because the graph engine should have required
                 * the general property group.  So we'll just use default
                 * flags in anticipation of the graph engine sending us
                 * REMOVE_INSTANCE when it finds out that the general property
                 * group has been deleted.
                 */
                inst->ri_flags = RINST_CONTRACT;
                break;

        default:
                assert(0);
                abort();
        }

        r = libscf_get_template_values(scf_inst, snap,
            &inst->ri_common_name, &inst->ri_C_common_name);

        /*
         * Copy our names to smaller buffers to reduce our memory footprint.
         */
        if (inst->ri_common_name != NULL) {
                char *tmp = safe_strdup(inst->ri_common_name);
                startd_free(inst->ri_common_name, max_scf_value_size);
                inst->ri_common_name = tmp;
        }

        if (inst->ri_C_common_name != NULL) {
                char *tmp = safe_strdup(inst->ri_C_common_name);
                startd_free(inst->ri_C_common_name, max_scf_value_size);
                inst->ri_C_common_name = tmp;
        }

        switch (r) {
        case 0:
                break;

        case ECONNABORTED:
                libscf_handle_rebind(h);
                goto rep_retry;

        case ECANCELED:
                goto deleted;

        case ECHILD:
        case ENOENT:
                break;

        default:
                assert(0);
                abort();
        }

        switch (libscf_read_method_ids(h, scf_inst, inst->ri_i.i_fmri,
            &inst->ri_i.i_primary_ctid, &inst->ri_i.i_transient_ctid,
            &start_pid)) {
        case 0:
                break;

        case ECONNABORTED:
                libscf_handle_rebind(h);
                goto rep_retry;

        case ECANCELED:
                goto deleted;

        default:
                assert(0);
                abort();
        }

        if (inst->ri_i.i_primary_ctid >= 1) {
                contract_hash_store(inst->ri_i.i_primary_ctid, inst->ri_id);

                switch (check_contract(inst, B_TRUE, scf_inst)) {
                case 0:
                        break;

                case ECONNABORTED:
                        libscf_handle_rebind(h);
                        goto rep_retry;

                case ECANCELED:
                        goto deleted;

                default:
                        assert(0);
                        abort();
                }
        }

        if (inst->ri_i.i_transient_ctid >= 1) {
                switch (check_contract(inst, B_FALSE, scf_inst)) {
                case 0:
                        break;

                case ECONNABORTED:
                        libscf_handle_rebind(h);
                        goto rep_retry;

                case ECANCELED:
                        goto deleted;

                default:
                        assert(0);
                        abort();
                }
        }

        /* No more failures we live through, so add it to the list. */
        (void) pthread_mutex_init(&inst->ri_lock, &mutex_attrs);
        (void) pthread_mutex_init(&inst->ri_queue_lock, &mutex_attrs);
        MUTEX_LOCK(&inst->ri_lock);
        MUTEX_LOCK(&inst->ri_queue_lock);

        (void) pthread_cond_init(&inst->ri_method_cv, NULL);

        uu_list_node_init(inst, &inst->ri_link, restarter_instance_pool);
        uu_list_insert(instance_list.ril_instance_list, inst, idx);
        MUTEX_UNLOCK(&instance_list.ril_lock);

        if (start_pid != -1 &&
            (inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT) {
                int ret;
                ret = wait_register(start_pid, inst->ri_i.i_fmri, 0, 1);
                if (ret == -1) {
                        /*
                         * Implication:  if we can't reregister the
                         * instance, we will start another one.  Two
                         * instances may or may not result in a resource
                         * conflict.
                         */
                        log_error(LOG_WARNING,
                            "%s: couldn't reregister %ld for wait\n",
                            inst->ri_i.i_fmri, start_pid);
                } else if (ret == 1) {
                        /*
                         * Leading PID has exited.
                         */
                        (void) stop_instance(h, inst, RSTOP_EXIT);
                }
        }


        scf_pg_destroy(pg);

        if (do_commit_states)
                (void) restarter_instance_update_states(h, inst, state,
                    next_state, RERR_NONE, reason);

        log_framework(LOG_DEBUG, "%s is a %s-style service\n", name,
            service_style(inst->ri_flags));

        MUTEX_UNLOCK(&inst->ri_queue_lock);
        MUTEX_UNLOCK(&inst->ri_lock);

        startd_free(svc_name, max_scf_name_size);
        startd_free(inst_name, max_scf_name_size);
        scf_snapshot_destroy(snap);
        scf_instance_destroy(scf_inst);
        scf_service_destroy(scf_svc);

        log_framework(LOG_DEBUG, "%s: inserted instance into restarter list\n",
            name);

        return (0);

deleted:
        MUTEX_UNLOCK(&instance_list.ril_lock);
        startd_free(inst_name, max_scf_name_size);
        startd_free(svc_name, max_scf_name_size);
        if (snap != NULL)
                scf_snapshot_destroy(snap);
        scf_pg_destroy(pg);
        scf_instance_destroy(scf_inst);
        scf_service_destroy(scf_svc);
        startd_free((void *)inst->ri_i.i_fmri, strlen(inst->ri_i.i_fmri) + 1);
        uu_list_destroy(inst->ri_queue);
        if (inst->ri_logstem != NULL)
                startd_free(inst->ri_logstem, PATH_MAX);
        if (inst->ri_common_name != NULL)
                free(inst->ri_common_name);
        if (inst->ri_C_common_name != NULL)
                free(inst->ri_C_common_name);
        startd_free(inst->ri_utmpx_prefix, max_scf_value_size);
        startd_free(inst, sizeof (restarter_inst_t));
        return (ENOENT);
}

static void
restarter_delete_inst(restarter_inst_t *ri)
{
        int id;
        restarter_inst_t *rip;
        void *cookie = NULL;
        restarter_instance_qentry_t *e;

        assert(MUTEX_HELD(&ri->ri_lock));

        /*
         * Must drop the instance lock so we can pick up the instance_list
         * lock & remove the instance.
         */
        id = ri->ri_id;
        MUTEX_UNLOCK(&ri->ri_lock);

        MUTEX_LOCK(&instance_list.ril_lock);

        rip = uu_list_find(instance_list.ril_instance_list, &id, NULL, NULL);
        if (rip == NULL) {
                MUTEX_UNLOCK(&instance_list.ril_lock);
                return;
        }

        assert(ri == rip);

        uu_list_remove(instance_list.ril_instance_list, ri);

        log_framework(LOG_DEBUG, "%s: deleted instance from restarter list\n",
            ri->ri_i.i_fmri);

        MUTEX_UNLOCK(&instance_list.ril_lock);

        /*
         * We can lock the instance without holding the instance_list lock
         * since we removed the instance from the list.
         */
        MUTEX_LOCK(&ri->ri_lock);
        MUTEX_LOCK(&ri->ri_queue_lock);

        if (ri->ri_i.i_primary_ctid >= 1)
                contract_hash_remove(ri->ri_i.i_primary_ctid);

        while (ri->ri_method_thread != 0 || ri->ri_method_waiters > 0)
                (void) pthread_cond_wait(&ri->ri_method_cv, &ri->ri_lock);

        while ((e = uu_list_teardown(ri->ri_queue, &cookie)) != NULL)
                startd_free(e, sizeof (*e));
        uu_list_destroy(ri->ri_queue);

        startd_free((void *)ri->ri_i.i_fmri, strlen(ri->ri_i.i_fmri) + 1);
        startd_free(ri->ri_logstem, PATH_MAX);
        if (ri->ri_common_name != NULL)
                free(ri->ri_common_name);
        if (ri->ri_C_common_name != NULL)
                free(ri->ri_C_common_name);
        startd_free(ri->ri_utmpx_prefix, max_scf_value_size);
        (void) pthread_mutex_destroy(&ri->ri_lock);
        (void) pthread_mutex_destroy(&ri->ri_queue_lock);
        startd_free(ri, sizeof (restarter_inst_t));
}

/*
 * instance_is_wait_style()
 *
 *   Returns 1 if the given instance is a "wait-style" service instance.
 */
int
instance_is_wait_style(restarter_inst_t *inst)
{
        assert(MUTEX_HELD(&inst->ri_lock));
        return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_WAIT);
}

/*
 * instance_is_transient_style()
 *
 *   Returns 1 if the given instance is a transient service instance.
 */
int
instance_is_transient_style(restarter_inst_t *inst)
{
        assert(MUTEX_HELD(&inst->ri_lock));
        return ((inst->ri_flags & RINST_STYLE_MASK) == RINST_TRANSIENT);
}

/*
 * instance_in_transition()
 * Returns 1 if instance is in transition, 0 if not
 */
int
instance_in_transition(restarter_inst_t *inst)
{
        assert(MUTEX_HELD(&inst->ri_lock));
        if (inst->ri_i.i_next_state == RESTARTER_STATE_NONE)
                return (0);
        return (1);
}

/*
 * returns 1 if instance is already started, 0 if not
 */
static int
instance_started(restarter_inst_t *inst)
{
        int ret;

        assert(MUTEX_HELD(&inst->ri_lock));

        if (inst->ri_i.i_state == RESTARTER_STATE_ONLINE ||
            inst->ri_i.i_state == RESTARTER_STATE_DEGRADED) {
                ret = 1;
        } else {
                ret = 0;
        }

        return (ret);
}

/*
 * Returns
 *   0 - success
 *   ECONNRESET - success, but h was rebound
 */
int
restarter_instance_update_states(scf_handle_t *h, restarter_inst_t *ri,
    restarter_instance_state_t new_state,
    restarter_instance_state_t new_state_next, restarter_error_t err,
    restarter_str_t reason)
{
        protocol_states_t *states;
        int e;
        uint_t retry_count = 0, msecs = ALLOC_DELAY;
        boolean_t rebound = B_FALSE;
        int prev_state_online;
        int state_online;

        assert(MUTEX_HELD(&ri->ri_lock));

        prev_state_online = instance_started(ri);

retry:
        e = _restarter_commit_states(h, &ri->ri_i, new_state, new_state_next,
            restarter_get_str_short(reason));
        switch (e) {
        case 0:
                break;

        case ENOMEM:
                ++retry_count;
                if (retry_count < ALLOC_RETRY) {
                        (void) poll(NULL, 0, msecs);
                        msecs *= ALLOC_DELAY_MULT;
                        goto retry;
                }

                /* Like startd_alloc(). */
                uu_die("Insufficient memory.\n");
                /* NOTREACHED */

        case ECONNABORTED:
                libscf_handle_rebind(h);
                rebound = B_TRUE;
                goto retry;

        case EPERM:
        case EACCES:
        case EROFS:
                log_error(LOG_NOTICE, "Could not commit state change for %s "
                    "to repository: %s.\n", ri->ri_i.i_fmri, strerror(e));
                /* FALLTHROUGH */

        case ENOENT:
                ri->ri_i.i_state = new_state;
                ri->ri_i.i_next_state = new_state_next;
                break;

        case EINVAL:
        default:
                bad_error("_restarter_commit_states", e);
        }

        states = startd_alloc(sizeof (protocol_states_t));
        states->ps_state = new_state;
        states->ps_state_next = new_state_next;
        states->ps_err = err;
        states->ps_reason = reason;
        graph_protocol_send_event(ri->ri_i.i_fmri, GRAPH_UPDATE_STATE_CHANGE,
            (void *)states);

        state_online = instance_started(ri);

        if (prev_state_online && !state_online)
                ri->ri_post_offline_hook();
        else if (!prev_state_online && state_online)
                ri->ri_post_online_hook();

        return (rebound ? ECONNRESET : 0);
}

void
restarter_mark_pending_snapshot(const char *fmri, uint_t flag)
{
        restarter_inst_t *inst;

        assert(flag == RINST_RETAKE_RUNNING || flag == RINST_RETAKE_START);

        inst = inst_lookup_by_name(fmri);
        if (inst == NULL)
                return;

        inst->ri_flags |= flag;

        MUTEX_UNLOCK(&inst->ri_lock);
}

static void
restarter_take_pending_snapshots(scf_handle_t *h)
{
        restarter_inst_t *inst;
        int r;

        MUTEX_LOCK(&instance_list.ril_lock);

        for (inst = uu_list_first(instance_list.ril_instance_list);
            inst != NULL;
            inst = uu_list_next(instance_list.ril_instance_list, inst)) {
                const char *fmri;
                scf_instance_t *sinst = NULL;

                MUTEX_LOCK(&inst->ri_lock);

                /*
                 * This is where we'd check inst->ri_method_thread and if it
                 * were nonzero we'd wait in anticipation of another thread
                 * executing a method for inst.  Doing so with the instance_list
                 * locked, though, leads to deadlock.  Since taking a snapshot
                 * during that window won't hurt anything, we'll just continue.
                 */

                fmri = inst->ri_i.i_fmri;

                if (inst->ri_flags & RINST_RETAKE_RUNNING) {
                        scf_snapshot_t *rsnap;

                        (void) libscf_fmri_get_instance(h, fmri, &sinst);

                        rsnap = libscf_get_or_make_running_snapshot(sinst,
                            fmri, B_FALSE);

                        scf_instance_destroy(sinst);

                        if (rsnap != NULL)
                                inst->ri_flags &= ~RINST_RETAKE_RUNNING;

                        scf_snapshot_destroy(rsnap);
                }

                if (inst->ri_flags & RINST_RETAKE_START) {
                        switch (r = libscf_snapshots_poststart(h, fmri,
                            B_FALSE)) {
                        case 0:
                        case ENOENT:
                                inst->ri_flags &= ~RINST_RETAKE_START;
                                break;

                        case ECONNABORTED:
                                break;

                        case EACCES:
                        default:
                                bad_error("libscf_snapshots_poststart", r);
                        }
                }

                MUTEX_UNLOCK(&inst->ri_lock);
        }

        MUTEX_UNLOCK(&instance_list.ril_lock);
}

/* ARGSUSED */
void *
restarter_post_fsminimal_thread(void *unused)
{
        scf_handle_t *h;
        int r;

        (void) pthread_setname_np(pthread_self(), "restarter_post_fsmin");

        h = libscf_handle_create_bound_loop();

        for (;;) {
                r = libscf_create_self(h);
                if (r == 0)
                        break;

                assert(r == ECONNABORTED);
                libscf_handle_rebind(h);
        }

        restarter_take_pending_snapshots(h);

        (void) scf_handle_unbind(h);
        scf_handle_destroy(h);

        return (NULL);
}

/*
 * int stop_instance()
 *
 *   Stop the instance identified by the instance given as the second argument,
 *   for the cause stated.
 *
 *   Returns
 *     0 - success
 *     -1 - inst is in transition
 */
static int
stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
    stop_cause_t cause)
{
        fork_info_t *info;
        const char *cp;
        int err;
        restarter_error_t re;
        restarter_str_t reason;
        restarter_instance_state_t new_state;

        assert(MUTEX_HELD(&inst->ri_lock));
        assert(inst->ri_method_thread == 0);

        switch (cause) {
        case RSTOP_EXIT:
                re = RERR_RESTART;
                reason = restarter_str_ct_ev_exit;
                cp = "all processes in service exited";
                break;
        case RSTOP_ERR_CFG:
                re = RERR_FAULT;
                reason = restarter_str_method_failed;
                cp = "service exited with a configuration error";
                break;
        case RSTOP_ERR_EXIT:
                re = RERR_RESTART;
                reason = restarter_str_ct_ev_exit;
                cp = "service exited with an error";
                break;
        case RSTOP_CORE:
                re = RERR_FAULT;
                reason = restarter_str_ct_ev_core;
                cp = "process dumped core";
                break;
        case RSTOP_SIGNAL:
                re = RERR_FAULT;
                reason = restarter_str_ct_ev_signal;
                cp = "process received fatal signal from outside the service";
                break;
        case RSTOP_HWERR:
                re = RERR_FAULT;
                reason = restarter_str_ct_ev_hwerr;
                cp = "process killed due to uncorrectable hardware error";
                break;
        case RSTOP_DEPENDENCY:
                re = RERR_RESTART;
                reason = restarter_str_dependency_activity;
                cp = "dependency activity requires stop";
                break;
        case RSTOP_DISABLE:
                re = RERR_RESTART;
                reason = restarter_str_disable_request;
                cp = "service disabled";
                break;
        case RSTOP_RESTART:
                re = RERR_RESTART;
                reason = restarter_str_restart_request;
                cp = "service restarting";
                break;
        default:
#ifndef NDEBUG
                (void) fprintf(stderr, "Unknown cause %d at %s:%d.\n",
                    cause, __FILE__, __LINE__);
#endif
                abort();
        }

        /* Services in the disabled and maintenance state are ignored */
        if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
            inst->ri_i.i_state == RESTARTER_STATE_DISABLED) {
                log_framework(LOG_DEBUG,
                    "%s: stop_instance -> is maint/disabled\n",
                    inst->ri_i.i_fmri);
                return (0);
        }

        /* Already stopped instances are left alone */
        if (instance_started(inst) == 0) {
                log_framework(LOG_DEBUG, "Restarter: %s is already stopped.\n",
                    inst->ri_i.i_fmri);
                return (0);
        }

        if (instance_in_transition(inst)) {
                /* requeue event by returning -1 */
                log_framework(LOG_DEBUG,
                    "Restarter: Not stopping %s, in transition.\n",
                    inst->ri_i.i_fmri);
                return (-1);
        }

        log_instance(inst, B_TRUE, "Stopping because %s.", cp);

        log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG,
            "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp);

        if (instance_is_wait_style(inst) &&
            (cause == RSTOP_EXIT ||
            cause == RSTOP_ERR_CFG ||
            cause == RSTOP_ERR_EXIT)) {
                /*
                 * No need to stop instance, as child has exited; remove
                 * contract and move the instance to the offline state.
                 */
                switch (err = restarter_instance_update_states(local_handle,
                    inst, inst->ri_i.i_state, RESTARTER_STATE_OFFLINE, re,
                    reason)) {
                case 0:
                case ECONNRESET:
                        break;

                default:
                        bad_error("restarter_instance_update_states", err);
                }

                if (cause == RSTOP_ERR_EXIT) {
                        /*
                         * The RSTOP_ERR_EXIT cause is set via the
                         * wait_thread -> wait_remove code path when we have
                         * a "wait" style svc that exited with an error. If
                         * the svc is failing too quickly, we throttle it so
                         * that we don't restart it more than once/second.
                         * Since we know we're running in the wait thread its
                         * ok to throttle it right here.
                         */
                        (void) update_fault_count(inst, FAULT_COUNT_INCR);
                        if (method_rate_critical(inst)) {
                                log_instance(inst, B_TRUE, "Failing too "
                                    "quickly, throttling.");
                                (void) sleep(WT_SVC_ERR_THROTTLE);
                        }
                } else {
                        (void) update_fault_count(inst, FAULT_COUNT_RESET);
                        reset_start_times(inst);
                }

                if (inst->ri_i.i_primary_ctid != 0) {
                        inst->ri_m_inst =
                            safe_scf_instance_create(local_handle);
                        inst->ri_mi_deleted = B_FALSE;

                        libscf_reget_instance(inst);
                        method_remove_contract(inst, B_TRUE, B_TRUE);

                        scf_instance_destroy(inst->ri_m_inst);
                        inst->ri_m_inst = NULL;
                }

                switch (err = restarter_instance_update_states(local_handle,
                    inst, inst->ri_i.i_next_state, RESTARTER_STATE_NONE, re,
                    reason)) {
                case 0:
                case ECONNRESET:
                        break;

                default:
                        bad_error("restarter_instance_update_states", err);
                }

                if (cause != RSTOP_ERR_CFG)
                        return (0);
        } else if (instance_is_wait_style(inst) && re == RERR_RESTART) {
                /*
                 * Stopping a wait service through means other than the pid
                 * exiting should keep wait_thread() from restarting the
                 * service, by removing it from the wait list.
                 * We cannot remove it right now otherwise the process will
                 * end up <defunct> so mark it to be ignored.
                 */
                wait_ignore_by_fmri(inst->ri_i.i_fmri);
        }

        /*
         * There are some configuration errors which we cannot detect until we
         * try to run the method.  For example, see exec_method() where the
         * restarter_set_method_context() call can return SMF_EXIT_ERR_CONFIG
         * in several cases. If this happens for a "wait-style" svc,
         * wait_remove() sets the cause as RSTOP_ERR_CFG so that we can detect
         * the configuration error and go into maintenance, even though it is
         * a "wait-style" svc.
         */
        if (cause == RSTOP_ERR_CFG)
                new_state = RESTARTER_STATE_MAINT;
        else
                new_state = inst->ri_i.i_enabled ?
                    RESTARTER_STATE_OFFLINE : RESTARTER_STATE_DISABLED;

        switch (err = restarter_instance_update_states(local_handle, inst,
            inst->ri_i.i_state, new_state, RERR_NONE, reason)) {
        case 0:
        case ECONNRESET:
                break;

        default:
                bad_error("restarter_instance_update_states", err);
        }

        info = startd_zalloc(sizeof (fork_info_t));

        info->sf_id = inst->ri_id;
        info->sf_method_type = METHOD_STOP;
        info->sf_event_type = re;
        info->sf_reason = reason;
        inst->ri_method_thread = startd_thread_create(method_thread, info);

        return (0);
}

/*
 * Returns
 *   ENOENT - fmri is not in instance_list
 *   0 - success
 *   ECONNRESET - success, though handle was rebound
 *   -1 - instance is in transition
 */
int
stop_instance_fmri(scf_handle_t *h, const char *fmri, uint_t flags)
{
        restarter_inst_t *rip;
        int r;

        rip = inst_lookup_by_name(fmri);
        if (rip == NULL)
                return (ENOENT);

        r = stop_instance(h, rip, flags);

        MUTEX_UNLOCK(&rip->ri_lock);

        return (r);
}

static void
unmaintain_instance(scf_handle_t *h, restarter_inst_t *rip,
    unmaint_cause_t cause)
{
        ctid_t ctid;
        scf_instance_t *inst;
        int r;
        uint_t tries = 0, msecs = ALLOC_DELAY;
        const char *cp;
        restarter_str_t reason;

        assert(MUTEX_HELD(&rip->ri_lock));

        if (rip->ri_i.i_state != RESTARTER_STATE_MAINT) {
                log_error(LOG_DEBUG, "Restarter: "
                    "Ignoring maintenance off command because %s is not in the "
                    "maintenance state.\n", rip->ri_i.i_fmri);
                return;
        }

        switch (cause) {
        case RUNMAINT_CLEAR:
                cp = "clear requested";
                reason = restarter_str_clear_request;
                break;
        case RUNMAINT_DISABLE:
                cp = "disable requested";
                reason = restarter_str_disable_request;
                break;
        default:
#ifndef NDEBUG
                (void) fprintf(stderr, "Uncaught case for %d at %s:%d.\n",
                    cause, __FILE__, __LINE__);
#endif
                abort();
        }

        log_instance(rip, B_TRUE, "Leaving maintenance because %s.",
            cp);
        log_framework(LOG_DEBUG, "%s: Instance leaving maintenance because "
            "%s.\n", rip->ri_i.i_fmri, cp);

        (void) restarter_instance_update_states(h, rip, RESTARTER_STATE_UNINIT,
            RESTARTER_STATE_NONE, RERR_RESTART, reason);

        /*
         * If we did ADMIN_MAINT_ON_IMMEDIATE, then there might still be
         * a primary contract.
         */
        if (rip->ri_i.i_primary_ctid == 0)
                return;

        ctid = rip->ri_i.i_primary_ctid;
        contract_abandon(ctid);
        rip->ri_i.i_primary_ctid = 0;

rep_retry:
        switch (r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst)) {
        case 0:
                break;

        case ECONNABORTED:
                libscf_handle_rebind(h);
                goto rep_retry;

        case ENOENT:
                /* Must have been deleted. */
                return;

        case EINVAL:
        case ENOTSUP:
        default:
                bad_error("libscf_handle_rebind", r);
        }

again:
        r = restarter_remove_contract(inst, ctid, RESTARTER_CONTRACT_PRIMARY);
        switch (r) {
        case 0:
                break;

        case ENOMEM:
                ++tries;
                if (tries < ALLOC_RETRY) {
                        (void) poll(NULL, 0, msecs);
                        msecs *= ALLOC_DELAY_MULT;
                        goto again;
                }

                uu_die("Insufficient memory.\n");
                /* NOTREACHED */

        case ECONNABORTED:
                scf_instance_destroy(inst);
                libscf_handle_rebind(h);
                goto rep_retry;

        case ECANCELED:
                break;

        case EPERM:
        case EACCES:
        case EROFS:
                log_error(LOG_INFO,
                    "Could not remove contract id %lu for %s (%s).\n", ctid,
                    rip->ri_i.i_fmri, strerror(r));
                break;

        case EINVAL:
        case EBADF:
        default:
                bad_error("restarter_remove_contract", r);
        }

        scf_instance_destroy(inst);
}

/*
 * enable_inst()
 *   Set inst->ri_i.i_enabled.  Expects 'e' to be _ENABLE, _DISABLE, or
 *   _ADMIN_DISABLE.  If the event is _ENABLE and inst is uninitialized or
 *   disabled, move it to offline.  If the event is _DISABLE or
 *   _ADMIN_DISABLE, make sure inst will move to disabled.
 *
 *   Returns
 *     0 - success
 *     ECONNRESET - h was rebound
 */
static int
enable_inst(scf_handle_t *h, restarter_inst_t *inst,
    restarter_instance_qentry_t *riq)
{
        restarter_instance_state_t state;
        restarter_event_type_t e = riq->riq_type;
        restarter_str_t reason = restarter_str_per_configuration;
        int r;

        assert(MUTEX_HELD(&inst->ri_lock));
        assert(e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE ||
            e == RESTARTER_EVENT_TYPE_DISABLE ||
            e == RESTARTER_EVENT_TYPE_ENABLE);
        assert(instance_in_transition(inst) == 0);

        state = inst->ri_i.i_state;

        if (e == RESTARTER_EVENT_TYPE_ENABLE) {
                inst->ri_i.i_enabled = 1;

                if (state == RESTARTER_STATE_UNINIT ||
                    state == RESTARTER_STATE_DISABLED) {
                        /*
                         * B_FALSE: Don't log an error if the log_instance()
                         * fails because it will fail on the miniroot before
                         * install-discovery runs.
                         */
                        log_instance(inst, B_FALSE, "Enabled.");
                        log_framework(LOG_DEBUG, "%s: Instance enabled.\n",
                            inst->ri_i.i_fmri);

                        /*
                         * If we are coming from DISABLED, it was obviously an
                         * enable request. If we are coming from UNINIT, it may
                         * have been a sevice in MAINT that was cleared.
                         */
                        if (riq->riq_reason == restarter_str_clear_request)
                                reason = restarter_str_clear_request;
                        else if (state == RESTARTER_STATE_DISABLED)
                                reason = restarter_str_enable_request;
                        (void) restarter_instance_update_states(h, inst,
                            RESTARTER_STATE_OFFLINE, RESTARTER_STATE_NONE,
                            RERR_NONE, reason);
                } else {
                        log_framework(LOG_DEBUG, "Restarter: "
                            "Not changing state of %s for enable command.\n",
                            inst->ri_i.i_fmri);
                }
        } else {
                inst->ri_i.i_enabled = 0;

                switch (state) {
                case RESTARTER_STATE_ONLINE:
                case RESTARTER_STATE_DEGRADED:
                        r = stop_instance(h, inst, RSTOP_DISABLE);
                        return (r == ECONNRESET ? 0 : r);

                case RESTARTER_STATE_OFFLINE:
                case RESTARTER_STATE_UNINIT:
                        if (inst->ri_i.i_primary_ctid != 0) {
                                inst->ri_m_inst = safe_scf_instance_create(h);
                                inst->ri_mi_deleted = B_FALSE;

                                libscf_reget_instance(inst);
                                method_remove_contract(inst, B_TRUE, B_TRUE);

                                scf_instance_destroy(inst->ri_m_inst);
                        }
                        /* B_FALSE: See log_instance(..., "Enabled."); above */
                        log_instance(inst, B_FALSE, "Disabled.");
                        log_framework(LOG_DEBUG, "%s: Instance disabled.\n",
                            inst->ri_i.i_fmri);

                        /*
                         * If we are coming from OFFLINE, it was obviously a
                         * disable request. But if we are coming from
                         * UNINIT, it may have been a disable request for a
                         * service in MAINT.
                         */
                        if (riq->riq_reason == restarter_str_disable_request ||
                            state == RESTARTER_STATE_OFFLINE)
                                reason = restarter_str_disable_request;
                        (void) restarter_instance_update_states(h, inst,
                            RESTARTER_STATE_DISABLED, RESTARTER_STATE_NONE,
                            RERR_RESTART, reason);
                        return (0);

                case RESTARTER_STATE_DISABLED:
                        break;

                case RESTARTER_STATE_MAINT:
                        /*
                         * We only want to pull the instance out of maintenance
                         * if the disable is on adminstrative request.  The
                         * graph engine sends _DISABLE events whenever a
                         * service isn't in the disabled state, and we don't
                         * want to pull the service out of maintenance if,
                         * for example, it is there due to a dependency cycle.
                         */
                        if (e == RESTARTER_EVENT_TYPE_ADMIN_DISABLE)
                                unmaintain_instance(h, inst, RUNMAINT_DISABLE);
                        break;

                default:
#ifndef NDEBUG
                        (void) fprintf(stderr, "Restarter instance %s has "
                            "unknown state %d.\n", inst->ri_i.i_fmri, state);
#endif
                        abort();
                }
        }

        return (0);
}

static void
start_instance(scf_handle_t *local_handle, restarter_inst_t *inst,
    int32_t reason)
{
        fork_info_t *info;
        restarter_str_t new_reason;

        assert(MUTEX_HELD(&inst->ri_lock));
        assert(instance_in_transition(inst) == 0);
        assert(inst->ri_method_thread == 0);

        log_framework(LOG_DEBUG, "%s: trying to start instance\n",
            inst->ri_i.i_fmri);

        /*
         * We want to keep the original reason for restarts and clear actions
         */
        switch (reason) {
        case restarter_str_restart_request:
        case restarter_str_clear_request:
                new_reason = reason;
                break;
        default:
                new_reason = restarter_str_dependencies_satisfied;
        }

        /* Services in the disabled and maintenance state are ignored */
        if (inst->ri_i.i_state == RESTARTER_STATE_MAINT ||
            inst->ri_i.i_state == RESTARTER_STATE_DISABLED ||
            inst->ri_i.i_enabled == 0) {
                log_framework(LOG_DEBUG,
                    "%s: start_instance -> is maint/disabled\n",
                    inst->ri_i.i_fmri);
                return;
        }

        /* Already started instances are left alone */
        if (instance_started(inst) == 1) {
                log_framework(LOG_DEBUG,
                    "%s: start_instance -> is already started\n",
                    inst->ri_i.i_fmri);
                return;
        }

        log_framework(LOG_DEBUG, "%s: starting instance.\n", inst->ri_i.i_fmri);

        (void) restarter_instance_update_states(local_handle, inst,
            inst->ri_i.i_state, RESTARTER_STATE_ONLINE, RERR_NONE, new_reason);

        info = startd_zalloc(sizeof (fork_info_t));

        info->sf_id = inst->ri_id;
        info->sf_method_type = METHOD_START;
        info->sf_event_type = RERR_NONE;
        info->sf_reason = new_reason;
        inst->ri_method_thread = startd_thread_create(method_thread, info);
}

static int
event_from_tty(scf_handle_t *h, restarter_inst_t *rip)
{
        scf_instance_t *inst;
        int ret = 0;

        if (libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst))
                return (-1);

        ret = restarter_inst_ractions_from_tty(inst);

        scf_instance_destroy(inst);
        return (ret);
}

static boolean_t
restart_dump(scf_handle_t *h, restarter_inst_t *rip)
{
        scf_instance_t *inst;
        boolean_t ret = B_FALSE;

        if (libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst))
                return (-1);

        if (restarter_inst_dump(inst) == 1)
                ret = B_TRUE;

        scf_instance_destroy(inst);
        return (ret);
}

static void
maintain_instance(scf_handle_t *h, restarter_inst_t *rip, int immediate,
    restarter_str_t reason)
{
        fork_info_t *info;
        scf_instance_t *scf_inst = NULL;

        assert(MUTEX_HELD(&rip->ri_lock));
        assert(reason != restarter_str_none);
        assert(rip->ri_method_thread == 0);

        log_instance(rip, B_TRUE, "Stopping for maintenance due to %s.",
            restarter_get_str_short(reason));
        log_framework(LOG_DEBUG, "%s: stopping for maintenance due to %s.\n",
            rip->ri_i.i_fmri, restarter_get_str_short(reason));

        /* Services in the maintenance state are ignored */
        if (rip->ri_i.i_state == RESTARTER_STATE_MAINT) {
                log_framework(LOG_DEBUG,
                    "%s: maintain_instance -> is already in maintenance\n",
                    rip->ri_i.i_fmri);
                return;
        }

        /*
         * If reason state is restarter_str_service_request and
         * restarter_actions/auxiliary_fmri property is set with a valid fmri,
         * copy the fmri to restarter/auxiliary_fmri so svcs -x can use.
         */
        if (reason == restarter_str_service_request &&
            libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &scf_inst) == 0) {
                if (restarter_inst_validate_ractions_aux_fmri(scf_inst) == 0) {
                        if (restarter_inst_set_aux_fmri(scf_inst))
                                log_framework(LOG_DEBUG, "%s: "
                                    "restarter_inst_set_aux_fmri failed: ",
                                    rip->ri_i.i_fmri);
                } else {
                        log_framework(LOG_DEBUG, "%s: "
                            "restarter_inst_validate_ractions_aux_fmri "
                            "failed: ", rip->ri_i.i_fmri);

                        if (restarter_inst_reset_aux_fmri(scf_inst))
                                log_framework(LOG_DEBUG, "%s: "
                                    "restarter_inst_reset_aux_fmri failed: ",
                                    rip->ri_i.i_fmri);
                }
                scf_instance_destroy(scf_inst);
        }

        if (immediate || !instance_started(rip)) {
                if (rip->ri_i.i_primary_ctid != 0) {
                        rip->ri_m_inst = safe_scf_instance_create(h);
                        rip->ri_mi_deleted = B_FALSE;

                        libscf_reget_instance(rip);
                        method_remove_contract(rip, B_TRUE, B_TRUE);

                        scf_instance_destroy(rip->ri_m_inst);
                }

                (void) restarter_instance_update_states(h, rip,
                    RESTARTER_STATE_MAINT, RESTARTER_STATE_NONE, RERR_RESTART,
                    reason);
                return;
        }

        (void) restarter_instance_update_states(h, rip, rip->ri_i.i_state,
            RESTARTER_STATE_MAINT, RERR_NONE, reason);

        log_transition(rip, MAINT_REQUESTED);

        info = startd_zalloc(sizeof (*info));
        info->sf_id = rip->ri_id;
        info->sf_method_type = METHOD_STOP;
        info->sf_event_type = RERR_RESTART;
        info->sf_reason = reason;
        rip->ri_method_thread = startd_thread_create(method_thread, info);
}

static void
refresh_instance(scf_handle_t *h, restarter_inst_t *rip)
{
        scf_instance_t *inst;
        scf_snapshot_t *snap;
        fork_info_t *info;
        int r;

        assert(MUTEX_HELD(&rip->ri_lock));

        log_instance(rip, B_TRUE, "Rereading configuration.");
        log_framework(LOG_DEBUG, "%s: rereading configuration.\n",
            rip->ri_i.i_fmri);

rep_retry:
        r = libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &inst);
        switch (r) {
        case 0:
                break;

        case ECONNABORTED:
                libscf_handle_rebind(h);
                goto rep_retry;

        case ENOENT:
                /* Must have been deleted. */
                return;

        case EINVAL:
        case ENOTSUP:
        default:
                bad_error("libscf_fmri_get_instance", r);
        }

        snap = libscf_get_running_snapshot(inst);

        r = libscf_get_startd_properties(inst, snap, &rip->ri_flags,
            &rip->ri_utmpx_prefix);
        switch (r) {
        case 0:
                log_framework(LOG_DEBUG, "%s is a %s-style service\n",
                    rip->ri_i.i_fmri, service_style(rip->ri_flags));
                break;

        case ECONNABORTED:
                scf_instance_destroy(inst);
                scf_snapshot_destroy(snap);
                libscf_handle_rebind(h);
                goto rep_retry;

        case ECANCELED:
        case ENOENT:
                /* Succeed in anticipation of REMOVE_INSTANCE. */
                break;

        default:
                bad_error("libscf_get_startd_properties", r);
        }

        if (instance_started(rip)) {
                /* Refresh does not change the state. */
                (void) restarter_instance_update_states(h, rip,
                    rip->ri_i.i_state, rip->ri_i.i_state, RERR_NONE,
                    restarter_str_refresh);

                info = startd_zalloc(sizeof (*info));
                info->sf_id = rip->ri_id;
                info->sf_method_type = METHOD_REFRESH;
                info->sf_event_type = RERR_REFRESH;
                info->sf_reason = 0;

                assert(rip->ri_method_thread == 0);
                rip->ri_method_thread =
                    startd_thread_create(method_thread, info);
        }

        scf_snapshot_destroy(snap);
        scf_instance_destroy(inst);
}

static void
degrade_instance(scf_handle_t *h, restarter_inst_t *rip, restarter_str_t reason)
{
        scf_instance_t *scf_inst = NULL;

        assert(MUTEX_HELD(&rip->ri_lock));

        log_instance(rip, B_TRUE, "Marking degraded due to %s.",
            restarter_get_str_short(reason));
        log_framework(LOG_DEBUG, "%s: marking degraded due to %s.\n",
            rip->ri_i.i_fmri, restarter_get_str_short(reason));

        /* Services that aren't online are ignored */
        if (rip->ri_i.i_state != RESTARTER_STATE_ONLINE) {
                log_framework(LOG_DEBUG,
                    "%s: degrade_instance -> is not online\n",
                    rip->ri_i.i_fmri);
                return;
        }

        /*
         * If reason state is restarter_str_service_request and
         * restarter_actions/auxiliary_fmri property is set with a valid fmri,
         * copy the fmri to restarter/auxiliary_fmri so svcs -x can use.
         */
        if (reason == restarter_str_service_request &&
            libscf_fmri_get_instance(h, rip->ri_i.i_fmri, &scf_inst) == 0) {
                if (restarter_inst_validate_ractions_aux_fmri(scf_inst) == 0) {
                        if (restarter_inst_set_aux_fmri(scf_inst)) {
                                log_framework(LOG_DEBUG, "%s: "
                                    "restarter_inst_set_aux_fmri failed: ",
                                    rip->ri_i.i_fmri);
                        }
                } else {
                        log_framework(LOG_DEBUG, "%s: "
                            "restarter_inst_validate_ractions_aux_fmri "
                            "failed: ", rip->ri_i.i_fmri);

                        if (restarter_inst_reset_aux_fmri(scf_inst)) {
                                log_framework(LOG_DEBUG, "%s: "
                                    "restarter_inst_reset_aux_fmri failed: ",
                                    rip->ri_i.i_fmri);
                        }
                }
                scf_instance_destroy(scf_inst);
        }

        (void) restarter_instance_update_states(h, rip,
            RESTARTER_STATE_DEGRADED, RESTARTER_STATE_NONE, RERR_NONE, reason);

        log_transition(rip, DEGRADE_REQUESTED);
}

/*
 * Note that the ordering of these must match the restarter event types defined
 * in librestart.h
 */
const char *event_names[] = { "INVALID", "ADD_INSTANCE", "REMOVE_INSTANCE",
        "ENABLE", "DISABLE", "ADMIN_DEGRADED", "ADMIN_REFRESH",
        "ADMIN_RESTART", "ADMIN_MAINT_OFF", "ADMIN_MAINT_ON",
        "ADMIN_MAINT_ON_IMMEDIATE", "STOP", "START", "DEPENDENCY_CYCLE",
        "INVALID_DEPENDENCY", "ADMIN_DISABLE", "STOP_RESET",
        "ADMIN_DEGRADE_IMMEDIATE", "ADMIN_RESTORE"
};

/*
 * void *restarter_process_events()
 *
 *   Called in a separate thread to process the events on an instance's
 *   queue.  Empties the queue completely, and tries to keep the thread
 *   around for a little while after the queue is empty to save on
 *   startup costs.
 */
static void *
restarter_process_events(void *arg)
{
        scf_handle_t *h;
        restarter_instance_qentry_t *event;
        restarter_inst_t *rip;
        char *fmri = (char *)arg;
        struct timespec to;

        (void) pthread_setname_np(pthread_self(), "restarter_process_events");

        assert(fmri != NULL);

        h = libscf_handle_create_bound_loop();

        /* grab the queue lock */
        rip = inst_lookup_queue(fmri);
        if (rip == NULL)
                goto out;

again:

        while ((event = uu_list_first(rip->ri_queue)) != NULL) {
                restarter_inst_t *inst;

                /* drop the queue lock */
                MUTEX_UNLOCK(&rip->ri_queue_lock);

                /*
                 * Grab the inst lock -- this waits until any outstanding
                 * method finishes running.
                 */
                inst = inst_lookup_by_name(fmri);
                if (inst == NULL) {
                        /* Getting deleted in the middle isn't an error. */
                        goto cont;
                }

                assert(instance_in_transition(inst) == 0);

                /* process the event */
                switch (event->riq_type) {
                case RESTARTER_EVENT_TYPE_ENABLE:
                case RESTARTER_EVENT_TYPE_DISABLE:
                        (void) enable_inst(h, inst, event);
                        break;

                case RESTARTER_EVENT_TYPE_ADMIN_DISABLE:
                        if (enable_inst(h, inst, event) == 0)
                                reset_start_times(inst);
                        break;

                case RESTARTER_EVENT_TYPE_REMOVE_INSTANCE:
                        restarter_delete_inst(inst);
                        inst = NULL;
                        goto cont;

                case RESTARTER_EVENT_TYPE_STOP_RESET:
                        reset_start_times(inst);
                        /* FALLTHROUGH */
                case RESTARTER_EVENT_TYPE_STOP:
                        (void) stop_instance(h, inst, RSTOP_DEPENDENCY);
                        break;

                case RESTARTER_EVENT_TYPE_START:
                        start_instance(h, inst, event->riq_reason);
                        break;

                case RESTARTER_EVENT_TYPE_DEPENDENCY_CYCLE:
                        maintain_instance(h, inst, 0,
                            restarter_str_dependency_cycle);
                        break;

                case RESTARTER_EVENT_TYPE_INVALID_DEPENDENCY:
                        maintain_instance(h, inst, 0,
                            restarter_str_invalid_dependency);
                        break;

                case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
                        if (event_from_tty(h, inst) == 0)
                                maintain_instance(h, inst, 0,
                                    restarter_str_service_request);
                        else
                                maintain_instance(h, inst, 0,
                                    restarter_str_administrative_request);
                        break;

                case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
                        if (event_from_tty(h, inst) == 0)
                                maintain_instance(h, inst, 1,
                                    restarter_str_service_request);
                        else
                                maintain_instance(h, inst, 1,
                                    restarter_str_administrative_request);
                        break;

                case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
                        unmaintain_instance(h, inst, RUNMAINT_CLEAR);
                        reset_start_times(inst);
                        break;

                case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
                        refresh_instance(h, inst);
                        break;

                case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
                case RESTARTER_EVENT_TYPE_ADMIN_DEGRADE_IMMEDIATE:
                        if (event_from_tty(h, inst) == 0) {
                                degrade_instance(h, inst,
                                    restarter_str_service_request);
                        } else {
                                degrade_instance(h, inst,
                                    restarter_str_administrative_request);
                        }
                        break;

                case RESTARTER_EVENT_TYPE_ADMIN_RESTORE:
                case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
                        if (!instance_started(inst)) {
                                log_framework(LOG_DEBUG, "Restarter: "
                                    "Not restarting %s; not running.\n",
                                    inst->ri_i.i_fmri);
                        } else {
                                /*
                                 * Stop the instance.  If it can be restarted,
                                 * the graph engine will send a new event.
                                 */
                                if (restart_dump(h, inst)) {
                                        (void) contract_kill(
                                            inst->ri_i.i_primary_ctid, SIGABRT,
                                            inst->ri_i.i_fmri);
                                } else if (stop_instance(h, inst,
                                    RSTOP_RESTART) == 0) {
                                        reset_start_times(inst);
                                }
                        }
                        break;

                case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
                default:
#ifndef NDEBUG
                        uu_warn("%s:%d: Bad restarter event %d.  "
                            "Aborting.\n", __FILE__, __LINE__, event->riq_type);
#endif
                        abort();
                }

                assert(inst != NULL);
                MUTEX_UNLOCK(&inst->ri_lock);

cont:
                /* grab the queue lock */
                rip = inst_lookup_queue(fmri);
                if (rip == NULL)
                        goto out;

                /* delete the event */
                uu_list_remove(rip->ri_queue, event);
                startd_free(event, sizeof (restarter_instance_qentry_t));
        }

        assert(rip != NULL);

        /*
         * Try to preserve the thread for a little while for future use.
         */
        to.tv_sec = 3;
        to.tv_nsec = 0;
        (void) pthread_cond_reltimedwait_np(&rip->ri_queue_cv,
            &rip->ri_queue_lock, &to);

        if (uu_list_first(rip->ri_queue) != NULL)
                goto again;

        rip->ri_queue_thread = 0;
        MUTEX_UNLOCK(&rip->ri_queue_lock);

out:
        (void) scf_handle_unbind(h);
        scf_handle_destroy(h);
        free(fmri);
        return (NULL);
}

static int
is_admin_event(restarter_event_type_t t)
{
        switch (t) {
        case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON:
        case RESTARTER_EVENT_TYPE_ADMIN_MAINT_ON_IMMEDIATE:
        case RESTARTER_EVENT_TYPE_ADMIN_MAINT_OFF:
        case RESTARTER_EVENT_TYPE_ADMIN_REFRESH:
        case RESTARTER_EVENT_TYPE_ADMIN_RESTORE:
        case RESTARTER_EVENT_TYPE_ADMIN_DEGRADED:
        case RESTARTER_EVENT_TYPE_ADMIN_DEGRADE_IMMEDIATE:
        case RESTARTER_EVENT_TYPE_ADMIN_RESTART:
                return (1);
        default:
                return (0);
        }
}

static void
restarter_queue_event(restarter_inst_t *ri, restarter_protocol_event_t *e)
{
        restarter_instance_qentry_t *qe;
        int r;

        assert(MUTEX_HELD(&ri->ri_queue_lock));
        assert(!MUTEX_HELD(&ri->ri_lock));

        qe = startd_zalloc(sizeof (restarter_instance_qentry_t));
        qe->riq_type = e->rpe_type;
        qe->riq_reason = e->rpe_reason;

        uu_list_node_init(qe, &qe->riq_link, restarter_queue_pool);
        r = uu_list_insert_before(ri->ri_queue, NULL, qe);
        assert(r == 0);
}

/*
 * void *restarter_event_thread()
 *
 *  Handle incoming graph events by placing them on a per-instance
 *  queue.  We can't lock the main part of the instance structure, so
 *  just modify the seprarately locked event queue portion.
 */
/*ARGSUSED*/
static void *
restarter_event_thread(void *unused)
{
        scf_handle_t *h;

        (void) pthread_setname_np(pthread_self(), "restarter_event");

        /*
         * This is a new thread, and thus, gets its own handle
         * to the repository.
         */
        h = libscf_handle_create_bound_loop();

        MUTEX_LOCK(&ru->restarter_update_lock);

        /*CONSTCOND*/
        while (1) {
                restarter_protocol_event_t *e;

                while (ru->restarter_update_wakeup == 0)
                        (void) pthread_cond_wait(&ru->restarter_update_cv,
                            &ru->restarter_update_lock);

                ru->restarter_update_wakeup = 0;

                while ((e = restarter_event_dequeue()) != NULL) {
                        restarter_inst_t *rip;
                        char *fmri;

                        MUTEX_UNLOCK(&ru->restarter_update_lock);

                        /*
                         * ADD_INSTANCE is special: there's likely no
                         * instance structure yet, so we need to handle the
                         * addition synchronously.
                         */
                        switch (e->rpe_type) {
                        case RESTARTER_EVENT_TYPE_ADD_INSTANCE:
                                if (restarter_insert_inst(h, e->rpe_inst) != 0)
                                        log_error(LOG_INFO, "Restarter: "
                                            "Could not add %s.\n", e->rpe_inst);

                                MUTEX_LOCK(&st->st_load_lock);
                                if (--st->st_load_instances == 0)
                                        (void) pthread_cond_broadcast(
                                            &st->st_load_cv);
                                MUTEX_UNLOCK(&st->st_load_lock);

                                goto nolookup;
                        }

                        /*
                         * Lookup the instance, locking only the event queue.
                         * Can't grab ri_lock here because it might be held
                         * by a long-running method.
                         */
                        rip = inst_lookup_queue(e->rpe_inst);
                        if (rip == NULL) {
                                log_error(LOG_INFO, "Restarter: "
                                    "Ignoring %s command for unknown service "
                                    "%s.\n", event_names[e->rpe_type],
                                    e->rpe_inst);
                                goto nolookup;
                        }

                        /* Keep ADMIN events from filling up the queue. */
                        if (is_admin_event(e->rpe_type) &&
                            uu_list_numnodes(rip->ri_queue) >
                            RINST_QUEUE_THRESHOLD) {
                                MUTEX_UNLOCK(&rip->ri_queue_lock);
                                log_instance(rip, B_TRUE, "Instance event "
                                    "queue overflow.  Dropping administrative "
                                    "request.");
                                log_framework(LOG_DEBUG, "%s: Instance event "
                                    "queue overflow.  Dropping administrative "
                                    "request.\n", rip->ri_i.i_fmri);
                                goto nolookup;
                        }

                        /* Now add the event to the instance queue. */
                        restarter_queue_event(rip, e);

                        if (rip->ri_queue_thread == 0) {
                                /*
                                 * Start a thread if one isn't already
                                 * running.
                                 */
                                fmri = safe_strdup(e->rpe_inst);
                                rip->ri_queue_thread =  startd_thread_create(
                                    restarter_process_events, (void *)fmri);
                        } else {
                                /*
                                 * Signal the existing thread that there's
                                 * a new event.
                                 */
                                (void) pthread_cond_broadcast(
                                    &rip->ri_queue_cv);
                        }

                        MUTEX_UNLOCK(&rip->ri_queue_lock);
nolookup:
                        restarter_event_release(e);

                        MUTEX_LOCK(&ru->restarter_update_lock);
                }
        }
}

static restarter_inst_t *
contract_to_inst(ctid_t ctid)
{
        restarter_inst_t *inst;
        int id;

        id = lookup_inst_by_contract(ctid);
        if (id == -1)
                return (NULL);

        inst = inst_lookup_by_id(id);
        if (inst != NULL) {
                /*
                 * Since ri_lock isn't held by the contract id lookup, this
                 * instance may have been restarted and now be in a new
                 * contract, making the old contract no longer valid for this
                 * instance.
                 */
                if (ctid != inst->ri_i.i_primary_ctid) {
                        MUTEX_UNLOCK(&inst->ri_lock);
                        inst = NULL;
                }
        }
        return (inst);
}

/*
 * void contract_action()
 *   Take action on contract events.
 */
static void
contract_action(scf_handle_t *h, restarter_inst_t *inst, ctid_t id,
    uint32_t type)
{
        const char *fmri = inst->ri_i.i_fmri;

        assert(MUTEX_HELD(&inst->ri_lock));

        /*
         * If startd has stopped this contract, there is no need to
         * stop it again.
         */
        if (inst->ri_i.i_primary_ctid > 0 &&
            inst->ri_i.i_primary_ctid_stopped)
                return;

        if ((type & (CT_PR_EV_EMPTY | CT_PR_EV_CORE | CT_PR_EV_SIGNAL
            | CT_PR_EV_HWERR)) == 0) {
                /*
                 * There shouldn't be other events, since that's not how we set
                 * the terms. Thus, just log an error and drive on.
                 */
                log_framework(LOG_NOTICE,
                    "%s: contract %ld received unexpected critical event "
                    "(%d)\n", fmri, id, type);
                return;
        }

        assert(instance_in_transition(inst) == 0);

        if (instance_is_wait_style(inst)) {
                /*
                 * We ignore all events; if they impact the
                 * process we're monitoring, then the
                 * wait_thread will stop the instance.
                 */
                log_framework(LOG_DEBUG,
                    "%s: ignoring contract event on wait-style service\n",
                    fmri);
        } else {
                /*
                 * A CT_PR_EV_EMPTY event is an RSTOP_EXIT request.
                 */
                switch (type) {
                case CT_PR_EV_EMPTY:
                        (void) stop_instance(h, inst, RSTOP_EXIT);
                        break;
                case CT_PR_EV_CORE:
                        (void) stop_instance(h, inst, RSTOP_CORE);
                        break;
                case CT_PR_EV_SIGNAL:
                        (void) stop_instance(h, inst, RSTOP_SIGNAL);
                        break;
                case CT_PR_EV_HWERR:
                        (void) stop_instance(h, inst, RSTOP_HWERR);
                        break;
                }
        }
}

/*
 * void *restarter_contract_event_thread(void *)
 *   Listens to the process contract bundle for critical events, taking action
 *   on events from contracts we know we are responsible for.
 */
/*ARGSUSED*/
static void *
restarter_contracts_event_thread(void *unused)
{
        int fd, err;
        scf_handle_t *local_handle;

        (void) pthread_setname_np(pthread_self(), "restarter_contracts_event");

        /*
         * Await graph load completion.  That is, stop here, until we've scanned
         * the repository for contract - instance associations.
         */
        MUTEX_LOCK(&st->st_load_lock);
        while (!(st->st_load_complete && st->st_load_instances == 0))
                (void) pthread_cond_wait(&st->st_load_cv, &st->st_load_lock);
        MUTEX_UNLOCK(&st->st_load_lock);

        /*
         * This is a new thread, and thus, gets its own handle
         * to the repository.
         */
        if ((local_handle = libscf_handle_create_bound(SCF_VERSION)) == NULL)
                uu_die("Unable to bind a new repository handle: %s\n",
                    scf_strerror(scf_error()));

        fd = open64(CTFS_ROOT "/process/pbundle", O_RDONLY);
        if (fd == -1)
                uu_die("process bundle open failed");

        /*
         * Make sure we get all events (including those generated by configd
         * before this thread was started).
         */
        err = ct_event_reset(fd);
        assert(err == 0);

        for (;;) {
                int efd, sfd;
                ct_evthdl_t ev;
                uint32_t type;
                ctevid_t evid;
                ct_stathdl_t status;
                ctid_t ctid;
                restarter_inst_t *inst;
                uint64_t cookie;

                if (err = ct_event_read_critical(fd, &ev)) {
                        log_error(LOG_WARNING,
                            "Error reading next contract event: %s",
                            strerror(err));
                        continue;
                }

                evid = ct_event_get_evid(ev);
                ctid = ct_event_get_ctid(ev);
                type = ct_event_get_type(ev);

                /* Fetch cookie. */
                if ((sfd = contract_open(ctid, "process", "status", O_RDONLY))
                    < 0) {
                        ct_event_free(ev);
                        continue;
                }

                if (err = ct_status_read(sfd, CTD_COMMON, &status)) {
                        log_framework(LOG_WARNING, "Could not get status for "
                            "contract %ld: %s\n", ctid, strerror(err));

                        startd_close(sfd);
                        ct_event_free(ev);
                        continue;
                }

                cookie = ct_status_get_cookie(status);

                log_framework(LOG_DEBUG, "Received event %d for ctid %ld "
                    "cookie %lld\n", type, ctid, cookie);

                ct_status_free(status);

                startd_close(sfd);

                /*
                 * svc.configd(8) restart handling performed by the
                 * fork_configd_thread.  We don't acknowledge, as that thread
                 * will do so.
                 */
                if (cookie == CONFIGD_COOKIE) {
                        ct_event_free(ev);
                        continue;
                }

                inst = NULL;
                if (storing_contract != 0 &&
                    (inst = contract_to_inst(ctid)) == NULL) {
                        /*
                         * This can happen for two reasons:
                         * - method_run() has not yet stored the
                         *    the contract into the internal hash table.
                         * - we receive an EMPTY event for an abandoned
                         *    contract.
                         * If there is any contract in the process of
                         * being stored into the hash table then re-read
                         * the event later.
                         */
                        log_framework(LOG_DEBUG,
                            "Reset event %d for unknown "
                            "contract id %ld\n", type, ctid);

                        /* don't go too fast */
                        (void) poll(NULL, 0, 100);

                        (void) ct_event_reset(fd);
                        ct_event_free(ev);
                        continue;
                }

                /*
                 * Do not call contract_to_inst() again if first
                 * call succeeded.
                 */
                if (inst == NULL)
                        inst = contract_to_inst(ctid);
                if (inst == NULL) {
                        /*
                         * This can happen if we receive an EMPTY
                         * event for an abandoned contract.
                         */
                        log_framework(LOG_DEBUG,
                            "Received event %d for unknown contract id "
                            "%ld\n", type, ctid);
                } else {
                        log_framework(LOG_DEBUG,
                            "Received event %d for contract id "
                            "%ld (%s)\n", type, ctid,
                            inst->ri_i.i_fmri);

                        contract_action(local_handle, inst, ctid, type);

                        MUTEX_UNLOCK(&inst->ri_lock);
                }

                efd = contract_open(ct_event_get_ctid(ev), "process", "ctl",
                    O_WRONLY);
                if (efd != -1) {
                        (void) ct_ctl_ack(efd, evid);
                        startd_close(efd);
                }

                ct_event_free(ev);

        }

        /*NOTREACHED*/
        return (NULL);
}

/*
 * Timeout queue, processed by restarter_timeouts_event_thread().
 */
timeout_queue_t *timeouts;
static uu_list_pool_t *timeout_pool;

typedef struct timeout_update {
        pthread_mutex_t         tu_lock;
        pthread_cond_t          tu_cv;
        int                     tu_wakeup;
} timeout_update_t;

timeout_update_t *tu;

static const char *timeout_ovr_svcs[] = {
        "svc:/system/manifest-import:default",
        "svc:/network/initial:default",
        "svc:/network/service:default",
        "svc:/system/rmtmpfiles:default",
        "svc:/network/loopback:default",
        "svc:/network/physical:default",
        "svc:/system/device/local:default",
        "svc:/system/filesystem/usr:default",
        "svc:/system/filesystem/minimal:default",
        "svc:/system/filesystem/local:default",
        NULL
};

int
is_timeout_ovr(restarter_inst_t *inst)
{
        int i;

        for (i = 0; timeout_ovr_svcs[i] != NULL; ++i) {
                if (strcmp(inst->ri_i.i_fmri, timeout_ovr_svcs[i]) == 0) {
                        log_instance(inst, B_TRUE, "Timeout override by "
                            "svc.startd.  Using infinite timeout.");
                        return (1);
                }
        }

        return (0);
}

/*ARGSUSED*/
static int
timeout_compare(const void *lc_arg, const void *rc_arg, void *private)
{
        hrtime_t t1 = ((const timeout_entry_t *)lc_arg)->te_timeout;
        hrtime_t t2 = ((const timeout_entry_t *)rc_arg)->te_timeout;

        if (t1 > t2)
                return (1);
        else if (t1 < t2)
                return (-1);
        return (0);
}

void
timeout_init()
{
        timeouts = startd_zalloc(sizeof (timeout_queue_t));

        (void) pthread_mutex_init(&timeouts->tq_lock, &mutex_attrs);

        timeout_pool = startd_list_pool_create("timeouts",
            sizeof (timeout_entry_t), offsetof(timeout_entry_t, te_link),
            timeout_compare, UU_LIST_POOL_DEBUG);
        assert(timeout_pool != NULL);

        timeouts->tq_list = startd_list_create(timeout_pool,
            timeouts, UU_LIST_SORTED);
        assert(timeouts->tq_list != NULL);

        tu = startd_zalloc(sizeof (timeout_update_t));
        (void) pthread_cond_init(&tu->tu_cv, NULL);
        (void) pthread_mutex_init(&tu->tu_lock, &mutex_attrs);
}

void
timeout_insert(restarter_inst_t *inst, ctid_t cid, uint64_t timeout_sec)
{
        hrtime_t now, timeout;
        timeout_entry_t *entry;
        uu_list_index_t idx;

        assert(MUTEX_HELD(&inst->ri_lock));

        now = gethrtime();

        /*
         * If we overflow LLONG_MAX, we're never timing out anyways, so
         * just return.
         */
        if (timeout_sec >= (LLONG_MAX - now) / 1000000000LL) {
                log_instance(inst, B_TRUE, "timeout_seconds too large, "
                    "treating as infinite.");
                return;
        }

        /* hrtime is in nanoseconds. Convert timeout_sec. */
        timeout = now + (timeout_sec * 1000000000LL);

        entry = startd_alloc(sizeof (timeout_entry_t));
        entry->te_timeout = timeout;
        entry->te_ctid = cid;
        entry->te_fmri = safe_strdup(inst->ri_i.i_fmri);
        entry->te_logstem = safe_strdup(inst->ri_logstem);
        entry->te_fired = 0;
        /* Insert the calculated timeout time onto the queue. */
        MUTEX_LOCK(&timeouts->tq_lock);
        (void) uu_list_find(timeouts->tq_list, entry, NULL, &idx);
        uu_list_node_init(entry, &entry->te_link, timeout_pool);
        uu_list_insert(timeouts->tq_list, entry, idx);
        MUTEX_UNLOCK(&timeouts->tq_lock);

        assert(inst->ri_timeout == NULL);
        inst->ri_timeout = entry;

        MUTEX_LOCK(&tu->tu_lock);
        tu->tu_wakeup = 1;
        (void) pthread_cond_broadcast(&tu->tu_cv);
        MUTEX_UNLOCK(&tu->tu_lock);
}


void
timeout_remove(restarter_inst_t *inst, ctid_t cid)
{
        assert(MUTEX_HELD(&inst->ri_lock));

        if (inst->ri_timeout == NULL)
                return;

        assert(inst->ri_timeout->te_ctid == cid);

        MUTEX_LOCK(&timeouts->tq_lock);
        uu_list_remove(timeouts->tq_list, inst->ri_timeout);
        MUTEX_UNLOCK(&timeouts->tq_lock);

        free(inst->ri_timeout->te_fmri);
        free(inst->ri_timeout->te_logstem);
        startd_free(inst->ri_timeout, sizeof (timeout_entry_t));
        inst->ri_timeout = NULL;
}

static int
timeout_now()
{
        timeout_entry_t *e;
        hrtime_t now;
        int ret;

        now = gethrtime();

        /*
         * Walk through the (sorted) timeouts list.  While the timeout
         * at the head of the list is <= the current time, kill the
         * method.
         */
        MUTEX_LOCK(&timeouts->tq_lock);

        for (e = uu_list_first(timeouts->tq_list);
            e != NULL && e->te_timeout <= now;
            e = uu_list_next(timeouts->tq_list, e)) {
                log_framework(LOG_WARNING, "%s: Method or service exit timed "
                    "out.  Killing contract %ld.\n", e->te_fmri, e->te_ctid);
                log_instance_fmri(e->te_fmri, e->te_logstem, B_TRUE,
                    "Method or service exit timed out.  Killing contract %ld.",
                    e->te_ctid);
                e->te_fired = 1;
                (void) contract_kill(e->te_ctid, SIGKILL, e->te_fmri);
        }

        if (uu_list_numnodes(timeouts->tq_list) > 0)
                ret = 0;
        else
                ret = -1;

        MUTEX_UNLOCK(&timeouts->tq_lock);

        return (ret);
}

/*
 * void *restarter_timeouts_event_thread(void *)
 *   Responsible for monitoring the method timeouts.  This thread must
 *   be started before any methods are called.
 */
/*ARGSUSED*/
static void *
restarter_timeouts_event_thread(void *unused)
{
        /*
         * Timeouts are entered on a priority queue, which is processed by
         * this thread.  As timeouts are specified in seconds, we'll do
         * the necessary processing every second, as long as the queue
         * is not empty.
         */

        (void) pthread_setname_np(pthread_self(), "restarter_timeouts_event");

        /*CONSTCOND*/
        while (1) {
                /*
                 * As long as the timeout list isn't empty, process it
                 * every second.
                 */
                if (timeout_now() == 0) {
                        (void) sleep(1);
                        continue;
                }

                /* The list is empty, wait until we have more timeouts. */
                MUTEX_LOCK(&tu->tu_lock);

                while (tu->tu_wakeup == 0)
                        (void) pthread_cond_wait(&tu->tu_cv, &tu->tu_lock);

                tu->tu_wakeup = 0;
                MUTEX_UNLOCK(&tu->tu_lock);
        }

        return (NULL);
}

void
restarter_start()
{
        (void) startd_thread_create(restarter_timeouts_event_thread, NULL);
        (void) startd_thread_create(restarter_event_thread, NULL);
        (void) startd_thread_create(restarter_contracts_event_thread, NULL);
        (void) startd_thread_create(wait_thread, NULL);
}


void
restarter_init()
{
        restarter_instance_pool = startd_list_pool_create("restarter_instances",
            sizeof (restarter_inst_t), offsetof(restarter_inst_t,
            ri_link), restarter_instance_compare, UU_LIST_POOL_DEBUG);
        (void) memset(&instance_list, 0, sizeof (instance_list));

        (void) pthread_mutex_init(&instance_list.ril_lock, &mutex_attrs);
        instance_list.ril_instance_list = startd_list_create(
            restarter_instance_pool, &instance_list, UU_LIST_SORTED);

        restarter_queue_pool = startd_list_pool_create(
            "restarter_instance_queue", sizeof (restarter_instance_qentry_t),
            offsetof(restarter_instance_qentry_t,  riq_link), NULL,
            UU_LIST_POOL_DEBUG);

        contract_list_pool = startd_list_pool_create(
            "contract_list", sizeof (contract_entry_t),
            offsetof(contract_entry_t,  ce_link), NULL,
            UU_LIST_POOL_DEBUG);
        contract_hash_init();

        log_framework(LOG_DEBUG, "Initialized restarter\n");
}