root/usr/src/cmd/rcm_daemon/common/rcm_lock.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include "rcm_impl.h"
#include "rcm_module.h"

/*
 * Global locks
 */
mutex_t rcm_req_lock;   /* protects global dr & info request list */

/*
 * Daemon state file
 */
static int state_fd;
#define RCM_STATE_FILE  "/var/run/rcm_daemon_state"
#define N_REQ_CHUNK     10      /* grow 10 entries at a time */

/*
 * Daemon timeout value
 */
#define RCM_DAEMON_TIMEOUT      300     /* 5 minutes idle time */

/*
 * Struct for a list of outstanding rcm requests
 */
typedef struct {
        int     seq_num;                /* sequence number of request */
        int     state;                  /* current state */
        pid_t   pid;                    /* pid of initiator */
        uint_t  flag;                   /* request flags */
        int     type;                   /* resource(device) type */
        timespec_t interval;            /* suspend interval */
        char    device[MAXPATHLEN];     /* name of device or resource */
} req_t;

typedef struct {
        int     n_req;
        int     n_req_max;      /* number of req_t's to follow */
        int     n_seq_max;      /* last sequence number */
        int     idle_timeout;   /* persist idle timeout value */
        req_t   req[1];
        /* more req_t follows */
} req_list_t;

static req_list_t *dr_req_list;
static req_list_t *info_req_list;

static const char *locked_info = "DR operation in progress";
static const char *locked_err = "Resource is busy";

static int rcmd_get_state();
static void add_to_polling_list(pid_t);
static void remove_from_polling_list(pid_t);

void start_polling_thread();
static void stop_polling_thread();

/*
 * Initialize request lists required for locking
 */
void
rcmd_lock_init(void)
{
        int size;
        struct stat fbuf;

        /*
         * Start info list with one slot, then grow on demand.
         */
        info_req_list = s_calloc(1, sizeof (req_list_t));
        info_req_list->n_req_max = 1;

        /*
         * Open daemon state file and map in contents
         */
        state_fd = open(RCM_STATE_FILE, O_CREAT|O_RDWR, 0600);
        if (state_fd == -1) {
                rcm_log_message(RCM_ERROR, gettext("cannot open %s: %s\n"),
                    RCM_STATE_FILE, strerror(errno));
                rcmd_exit(errno);
        }

        if (fstat(state_fd, &fbuf) != 0) {
                rcm_log_message(RCM_ERROR, gettext("cannot stat %s: %s\n"),
                    RCM_STATE_FILE, strerror(errno));
                rcmd_exit(errno);
        }

        size = fbuf.st_size;
        if (size == 0) {
                size = sizeof (req_list_t);
                if (ftruncate(state_fd, size) != 0) {
                        rcm_log_message(RCM_ERROR,
                            gettext("cannot truncate %s: %s\n"),
                            RCM_STATE_FILE, strerror(errno));
                        rcmd_exit(errno);
                }
        }

        /*LINTED*/
        dr_req_list = (req_list_t *)mmap(NULL, size, PROT_READ|PROT_WRITE,
            MAP_SHARED, state_fd, 0);
        if (dr_req_list == MAP_FAILED) {
                rcm_log_message(RCM_ERROR, gettext("cannot mmap %s: %s\n"),
                    RCM_STATE_FILE, strerror(errno));
                rcmd_exit(errno);
        }

        /*
         * Initial size is one entry
         */
        if (dr_req_list->n_req_max == 0) {
                dr_req_list->n_req_max = 1;
                (void) fsync(state_fd);
                return;
        }

        rcm_log_message(RCM_DEBUG, "n_req = %d, n_req_max = %d\n",
            dr_req_list->n_req, dr_req_list->n_req_max);

        /*
         * Recover the daemon state
         */
        clean_dr_list();
}

/*
 * Get a unique sequence number--to be called with rcm_req_lock held.
 */
static int
get_seq_number()
{
        int number;

        if (dr_req_list == NULL)
                return (0);

        dr_req_list->n_seq_max++;
        number  = (dr_req_list->n_seq_max << SEQ_NUM_SHIFT);
        (void) fsync(state_fd);

        return (number);
}

/*
 * Find entry in list with the same resource name and sequence number.
 * If seq_num == -1, no seq_num matching is required.
 */
static req_t *
find_req_entry(char *device, uint_t flag, int seq_num, req_list_t *list)
{
        int i;

        /*
         * Look for entry with the same resource and seq_num.
         * Also match RCM_FILESYS field in flag.
         */
        for (i = 0; i < list->n_req_max; i++) {
                if (list->req[i].state == RCM_STATE_REMOVE)
                        /* stale entry */
                        continue;
                /*
                 * We need to distiguish a file system root from the directory
                 * it is mounted on.
                 *
                 * Applications are not aware of any difference between the
                 * two, but the system keeps track of it internally by
                 * checking for mount points while traversing file path.
                 * In a similar spirit, RCM is keeping this difference as
                 * an implementation detail.
                 */
                if ((strcmp(device, list->req[i].device) != 0) ||
                    (list->req[i].flag & RCM_FILESYS) != (flag & RCM_FILESYS))
                        /* different resource */
                        continue;

                if ((seq_num != -1) && ((seq_num >> SEQ_NUM_SHIFT) !=
                    (list->req[i].seq_num >> SEQ_NUM_SHIFT)))
                        /* different base seqnum */
                        continue;

                return (&list->req[i]);
        }

        return (NULL);
}

/*
 * Get the next empty req_t entry. If no entry exists, grow the list.
 */
static req_t *
get_req_entry(req_list_t **listp)
{
        int i;
        int n_req = (*listp)->n_req;
        int n_req_max = (*listp)->n_req_max;

        /*
         * If the list is full, grow the list and return the first
         * entry in the new portion.
         */
        if (n_req == n_req_max) {
                int newsize;

                n_req_max += N_REQ_CHUNK;
                newsize = sizeof (req_list_t) + (n_req_max - 1) *
                    sizeof (req_t);

                if (listp == &info_req_list) {
                        *listp = s_realloc(*listp, newsize);
                } else if (ftruncate(state_fd, newsize) != 0) {
                        rcm_log_message(RCM_ERROR,
                            gettext("cannot truncate %s: %s\n"),
                            RCM_STATE_FILE, strerror(errno));
                        rcmd_exit(errno);
                /*LINTED*/
                } else if ((*listp = (req_list_t *)mmap(NULL, newsize,
                    PROT_READ|PROT_WRITE, MAP_SHARED, state_fd, 0)) ==
                    MAP_FAILED) {
                        rcm_log_message(RCM_ERROR,
                            gettext("cannot mmap %s: %s\n"),
                            RCM_STATE_FILE, strerror(errno));
                        rcmd_exit(errno);
                }

                /* Initialize the new entries */
                for (i = (*listp)->n_req_max; i < n_req_max; i++) {
                        (*listp)->req[i].state = RCM_STATE_REMOVE;
                        (void) strcpy((*listp)->req[i].device, "");
                }

                (*listp)->n_req_max = n_req_max;
                (*listp)->n_req++;
                return (&(*listp)->req[n_req]);
        }

        /*
         * List contains empty slots, find it.
         */
        for (i = 0; i < n_req_max; i++) {
                if (((*listp)->req[i].device[0] == '\0') ||
                    ((*listp)->req[i].state == RCM_STATE_REMOVE)) {
                        break;
                }
        }

        assert(i < n_req_max);  /* empty slot must exist */

        (*listp)->n_req++;
        return (&(*listp)->req[i]);
}

/*
 * When one resource depends on multiple resources, it's possible that
 * rcm_get_info can be called multiple times on the resource, resulting
 * in duplicate information. By assigning a unique sequence number to
 * each rcm_get_info operation, this duplication can be eliminated.
 *
 * Insert a dr entry in info_req_list
 */
int
info_req_add(char *rsrcname, uint_t flag, int seq_num)
{
        int error = 0;
        char *device;
        req_t *req;

        rcm_log_message(RCM_TRACE2, "info_req_add(%s, %d)\n",
            rsrcname, seq_num);

        device = resolve_name(rsrcname);
        (void) mutex_lock(&rcm_req_lock);

        /*
         * Look for entry with the same resource and seq_num.
         * If it exists, we return an error so that such
         * information is not gathered more than once.
         */
        if (find_req_entry(device, flag, seq_num, info_req_list) != NULL) {
                rcm_log_message(RCM_DEBUG, "getinfo cycle: %s %d \n",
                    device, seq_num);
                error = -1;
                goto out;
        }

        /*
         * Get empty entry and fill in seq_num and device.
         */
        req = get_req_entry(&info_req_list);
        req->seq_num = seq_num;
        req->state = RCM_STATE_ONLINE;  /* mark that the entry is in use */
        req->flag = flag;
        (void) strcpy(req->device, device);

out:
        (void) mutex_unlock(&rcm_req_lock);
        free(device);

        return (error);
}

/*
 * Remove all entries associated with seq_num from info_req_list
 */
void
info_req_remove(int seq_num)
{
        int i;

        rcm_log_message(RCM_TRACE3, "info_req_remove(%d)\n", seq_num);

        seq_num >>= SEQ_NUM_SHIFT;
        (void) mutex_lock(&rcm_req_lock);

        /* remove all entries with seq_num */
        for (i = 0; i < info_req_list->n_req_max; i++) {
                if (info_req_list->req[i].state == RCM_STATE_REMOVE)
                        continue;

                if ((info_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != seq_num)
                        continue;

                info_req_list->req[i].state = RCM_STATE_REMOVE;
                info_req_list->n_req--;
        }

        /*
         * We don't shrink the info_req_list size for now.
         */
        (void) mutex_unlock(&rcm_req_lock);
}

/*
 * Checking lock conflicts. There is a conflict if:
 * - attempt to DR a node when either its ancester or descendent
 *      is in the process of DR
 * - attempt to register for a node when its ancester is locked for DR
 */
static int
check_lock(char *device, uint_t flag, int cflag, rcm_info_t **info)
{
        int i, ret = RCM_SUCCESS;

        if (info)
                *info = NULL;

        /*
         * During daemon initialization, don't check locks
         */
        if (dr_req_list == NULL)
                return (ret);

        for (i = 0; i < dr_req_list->n_req; i++) {
                req_t *req = &dr_req_list->req[i];
                char *dr_dev = req->device;

                /*
                 * Skip empty entries
                 */
                if ((req->state == RCM_STATE_REMOVE) || (dr_dev[0] == '\0'))
                        continue;

                /*
                 * Make sure that none of the ancestors of dr_dev is
                 * being operated upon.
                 */
                if (EQUAL(device, dr_dev) || DESCENDENT(device, dr_dev)) {
                        /*
                         * An exception to this is the filesystem.
                         * We should allowed a filesystem rooted at a
                         * child directory to be unmounted.
                         */
                        if ((flag & RCM_FILESYS) && (!EQUAL(device, dr_dev) ||
                            ((dr_req_list->req[i].flag & RCM_FILESYS) == 0)))
                                continue;

                        assert(info != 0);

                        add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
                            dr_req_list->req[i].state,
                            dr_req_list->req[i].seq_num, NULL, locked_info,
                            locked_err, NULL, info);
                        ret = RCM_CONFLICT;
                        break;
                }

                if ((cflag == LOCK_FOR_DR) && DESCENDENT(dr_dev, device)) {
                        /*
                         * Check descendents only for DR request.
                         *
                         * Could have multiple descendents doing DR,
                         * we want to find them all.
                         */
                        assert(info != 0);

                        add_busy_rsrc_to_list(dr_dev, dr_req_list->req[i].pid,
                            dr_req_list->req[i].state,
                            dr_req_list->req[i].seq_num, NULL, locked_info,
                            locked_err, NULL, info);
                        ret = RCM_CONFLICT;
                        /* don't break here, need to find all conflicts */
                }
        }

        return (ret);
}

/*
 * Check for lock conflicts for DR operation or client registration
 */
int
rsrc_check_lock_conflicts(char *rsrcname, uint_t flag, int cflag,
    rcm_info_t **info)
{
        int result;
        char *device;

        device = resolve_name(rsrcname);
        result = check_lock(device, flag, cflag, info);
        free(device);

        return (result);
}

static int
transition_state(int state)
{
        /*
         * If the resource state is in transition, ask caller to
         * try again.
         */
        switch (state) {
        case RCM_STATE_OFFLINING:
        case RCM_STATE_SUSPENDING:
        case RCM_STATE_RESUMING:
        case RCM_STATE_ONLINING:
        case RCM_STATE_REMOVING:

                return (1);

        default:
                /*FALLTHROUGH*/
                break;
        }
        return (0);
}

/*
 * Update a dr entry in dr_req_list
 */
/*ARGSUSED*/
static int
dr_req_update_entry(char *device, pid_t pid, uint_t flag, int state,
    int seq_num, timespec_t *interval, rcm_info_t **infop)
{
        req_t *req;

        /*
         * Find request entry. If not found, return RCM_FAILURE
         */
        req = find_req_entry(device, flag, -1, dr_req_list);

        if (req == NULL) {
                switch (state) {
                case RCM_STATE_OFFLINE_QUERYING:
                case RCM_STATE_SUSPEND_QUERYING:
                case RCM_STATE_OFFLINING:
                case RCM_STATE_SUSPENDING:
                        /* could be re-do operation, no error message */
                        break;

                default:
                        rcm_log_message(RCM_DEBUG,
                            "update non-existing resource %s\n", device);
                }
                return (RCM_FAILURE);
        }

        /*
         * During initialization, update is unconditional (forced)
         * in order to bring the daemon up in a sane state.
         */
        if (rcmd_get_state() == RCMD_INIT)
                goto update;

        /*
         * Don't allow update with mismatched initiator pid. This could happen
         * as part of normal operation.
         */
        if (pid != req->pid) {
                rcm_log_message(RCM_INFO,
                    gettext("mismatched dr initiator pid: %ld %ld\n"),
                    req->pid, pid);
                goto failure;
        }

        rcm_log_message(RCM_TRACE4,
            "dr_req_update_entry: state=%d, device=%s\n",
            req->state, req->device);

        /*
         * Check that the state transition is valid
         */
        switch (state) {
        case RCM_STATE_OFFLINE_QUERYING:
        case RCM_STATE_OFFLINING:
                /*
                 * This is the case of re-offlining, which applies only
                 * if a previous attempt failed.
                 */
                if ((req->state != RCM_STATE_OFFLINE_FAIL) &&
                    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
                    (req->state != RCM_STATE_OFFLINE_QUERY) &&
                    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
                    (req->state != RCM_STATE_OFFLINE)) {
                        rcm_log_message(RCM_WARNING,
                            gettext("%s: invalid offlining from state %d\n"),
                            device, req->state);
                        goto failure;
                }
                break;

        case RCM_STATE_SUSPEND_QUERYING:
        case RCM_STATE_SUSPENDING:
                /*
                 * This is the case of re-suspending, which applies only
                 * if a previous attempt failed.
                 */
                if ((req->state != RCM_STATE_SUSPEND_FAIL) &&
                    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
                    (req->state != RCM_STATE_SUSPEND_QUERY) &&
                    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
                    (req->state != RCM_STATE_SUSPEND)) {
                        rcm_log_message(RCM_WARNING,
                            gettext("%s: invalid suspending from state %d\n"),
                            device, req->state);
                        goto failure;
                }
                break;

        case RCM_STATE_RESUMING:
                if ((req->state != RCM_STATE_SUSPEND) &&
                    (req->state != RCM_STATE_SUSPEND_QUERYING) &&
                    (req->state != RCM_STATE_SUSPEND_QUERY) &&
                    (req->state != RCM_STATE_SUSPEND_QUERY_FAIL) &&
                    (req->state != RCM_STATE_SUSPEND_FAIL)) {
                        rcm_log_message(RCM_DEBUG,
                            "%s: invalid resuming from state %d\n",
                            device, req->state);
                        goto failure;
                }
                break;

        case RCM_STATE_ONLINING:
                if ((req->state != RCM_STATE_OFFLINE) &&
                    (req->state != RCM_STATE_OFFLINE_QUERYING) &&
                    (req->state != RCM_STATE_OFFLINE_QUERY) &&
                    (req->state != RCM_STATE_OFFLINE_QUERY_FAIL) &&
                    (req->state != RCM_STATE_OFFLINE_FAIL)) {
                        rcm_log_message(RCM_INFO,
                            gettext("%s: invalid onlining from state %d\n"),
                            device, req->state);
                        goto failure;
                }
                break;

        case RCM_STATE_REMOVING:
                if ((req->state != RCM_STATE_OFFLINE) &&
                    (req->state != RCM_STATE_OFFLINE_FAIL)) {
                        rcm_log_message(RCM_INFO,
                            gettext("%s: invalid removing from state %d\n"),
                            device, req->state);
                        goto failure;
                }
                break;

        case RCM_STATE_SUSPEND_FAIL:
                assert(req->state == RCM_STATE_SUSPENDING);
                break;

        case RCM_STATE_OFFLINE_FAIL:
                assert(req->state == RCM_STATE_OFFLINING);
                break;

        case RCM_STATE_SUSPEND:
                assert(req->state == RCM_STATE_SUSPENDING);
                break;

        case RCM_STATE_OFFLINE:
                assert(req->state == RCM_STATE_OFFLINING);
                break;

        case RCM_STATE_ONLINE:
                assert((req->state == RCM_STATE_RESUMING) ||
                    (req->state == RCM_STATE_ONLINING));
                break;

        default:        /* shouldn't be here */
                rcm_log_message(RCM_ERROR,
                    gettext("invalid update to dr state: %d\n"), state);
                return (RCM_FAILURE);
        }

update:
        /*
         * update the state, interval, and sequence number; sync state file
         */
        req->state = state;
        req->seq_num = seq_num;

        if (interval)
                req->interval = *interval;
        else
                bzero(&req->interval, sizeof (timespec_t));

        (void) fsync(state_fd);
        return (RCM_SUCCESS);

failure:
        if (infop != NULL) {
                add_busy_rsrc_to_list(req->device, req->pid, req->state,
                    req->seq_num, NULL, locked_info, locked_err, NULL, infop);
        }

        /*
         * A request may be left in a transition state because the operator
         * typed ctrl-C. In this case, the daemon thread continues to run
         * and will eventually put the state in a non-transitional state.
         *
         * To be safe, we return EAGAIN to allow librcm to loop and retry.
         * If we are called from a module, loop & retry could result in a
         * deadlock. The called will check for this case and turn EAGAIN
         * into RCM_CONFLICT.
         */
        if (transition_state(req->state)) {
                return (EAGAIN);
        }

        return (RCM_CONFLICT);
}

/*
 * Insert a dr entry in dr_req_list
 */
int
dr_req_add(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
    timespec_t *interval, rcm_info_t **info)
{
        int error;
        char *device;
        req_t *req;

        rcm_log_message(RCM_TRACE3, "dr_req_add(%s, %ld, 0x%x, %d, %d, %p)\n",
            rsrcname, pid, flag, state, seq_num, (void *)info);

        device = resolve_name(rsrcname);
        if (device == NULL)
                return (EINVAL);

        (void) mutex_lock(&rcm_req_lock);

        /*
         * In the re-offline/suspend case, attempt to update dr request.
         *
         * If this succeeds, return success;
         * If this fails because of a conflict, return error;
         * If this this fails because no entry exists, add a new entry.
         */
        error = dr_req_update_entry(device, pid, flag, state, seq_num, interval,
            info);

        switch (error) {
        case RCM_FAILURE:
                /* proceed to add a new entry */
                break;

        case RCM_CONFLICT:
        case RCM_SUCCESS:
        case EAGAIN:
        default:
                goto out;
        }

        /*
         * Check for lock conflicts
         */
        error = check_lock(device, flag, LOCK_FOR_DR, info);
        if (error != RCM_SUCCESS) {
                error = RCM_CONFLICT;
                goto out;
        }

        /*
         * Get empty request entry, fill in values and sync state file
         */
        req = get_req_entry(&dr_req_list);

        req->seq_num = seq_num;
        req->pid = pid;
        req->flag = flag;
        req->state = state;
        req->type = rsrc_get_type(device);
        (void) strcpy(req->device, device);

        /* cache interval for failure recovery */
        if (interval)
                req->interval = *interval;
        else
                bzero(&req->interval, sizeof (timespec_t));

        (void) fsync(state_fd);

        /*
         * Add initiator pid to polling list
         */
        add_to_polling_list(req->pid);

out:
        (void) mutex_unlock(&rcm_req_lock);
        free(device);

        return (error);
}

/*
 * Update a dr entry in dr_req_list
 */
/*ARGSUSED*/
int
dr_req_update(char *rsrcname, pid_t pid, uint_t flag, int state, int seq_num,
    rcm_info_t **info)
{
        int error;
        char *device = resolve_name(rsrcname);

        rcm_log_message(RCM_TRACE3, "dr_req_update(%s, %ld, 0x%x, %d, %d)\n",
            rsrcname, pid, flag, state, seq_num);

        (void) mutex_lock(&rcm_req_lock);
        error = dr_req_update_entry(device, pid, flag, state, seq_num, NULL,
            info);
        (void) mutex_unlock(&rcm_req_lock);
        free(device);

        return (error);
}

/*
 * This function scans the DR request list for the next, non-removed
 * entry that is part of the specified sequence.  The 'device' name
 * of the entry is copied into the provided 'rsrc' buffer.
 *
 * The 'rsrc' buffer is required because the DR request list is only
 * locked during the duration of this lookup.  Giving a direct pointer
 * to something in the list would be unsafe.
 */
int
dr_req_lookup(int seq_num, char *rsrc)
{
        int     i;
        int     len;
        int     base = (seq_num >> SEQ_NUM_SHIFT);
        int     retval = RCM_FAILURE;

        if (rsrc == NULL) {
                return (RCM_FAILURE);
        }

        (void) mutex_lock(&rcm_req_lock);

        for (i = 0; i < dr_req_list->n_req_max; i++) {

                /* Skip removed or non-matching entries */
                if ((dr_req_list->req[i].state == RCM_STATE_REMOVE) ||
                    ((dr_req_list->req[i].seq_num >> SEQ_NUM_SHIFT) != base)) {
                        continue;
                }

                /* Copy the next-matching 'device' name into 'rsrc' */
                len = strlcpy(rsrc, dr_req_list->req[i].device, MAXPATHLEN);
                if (len < MAXPATHLEN) {
                        retval = RCM_SUCCESS;
                }
                break;
        }

        (void) mutex_unlock(&rcm_req_lock);

        return (retval);
}

/*
 * Remove a dr entry in dr_req_list
 */
void
dr_req_remove(char *rsrcname, uint_t flag)
{
        req_t *req;
        char *device = resolve_name(rsrcname);

        rcm_log_message(RCM_TRACE3, "dr_req_remove(%s)\n", rsrcname);

        (void) mutex_lock(&rcm_req_lock);

        /* find entry */
        req = find_req_entry(device, flag, -1, dr_req_list);
        free(device);

        if (req == NULL) {
                (void) mutex_unlock(&rcm_req_lock);
                rcm_log_message(RCM_WARNING,
                    gettext("dr_req entry %s not found\n"), rsrcname);
                return;
        }

        req->state = RCM_STATE_REMOVE;
        dr_req_list->n_req--;
        (void) fsync(state_fd);

        /*
         * remove pid from polling list
         */
        remove_from_polling_list(req->pid);

        /*
         * We don't shrink the dr_req_list size for now.
         * Shouldn't cause big memory leaks.
         */
        (void) mutex_unlock(&rcm_req_lock);
}

/*
 * Return the list of ongoing dr operation requests
 */
rcm_info_t *
rsrc_dr_info()
{
        int i;
        rcm_info_t *info;
        rcm_info_t *result = NULL;
        char *rsrc;
        int len;

        rcm_log_message(RCM_TRACE2, "rsrc_dr_info()\n");

        (void) mutex_lock(&rcm_req_lock);
        for (i = 0; i < dr_req_list->n_req_max; i++) {
                if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
                        continue;

                if (dr_req_list->req[i].device[0] == '\0')
                        continue;

                if (dr_req_list->req[i].flag & RCM_FILESYS) {
                        len = strlen(dr_req_list->req[i].device) + 5;
                        rsrc = s_malloc(len);
                        (void) snprintf(rsrc, len, "%s(fs)",
                            dr_req_list->req[i].device);
                } else {
                        rsrc = s_strdup(dr_req_list->req[i].device);
                }

                info = s_calloc(1, sizeof (*info));
                if (errno = nvlist_alloc(&(info->info), NV_UNIQUE_NAME, 0)) {
                        rcm_log_message(RCM_ERROR,
                            gettext("failed (nvlist_alloc=%s).\n"),
                            strerror(errno));
                        rcmd_exit(errno);
                }

                if (errno = nvlist_add_string(info->info, RCM_RSRCNAME, rsrc)) {
                        rcm_log_message(RCM_ERROR,
                            gettext("failed (nvlist_add=%s).\n"),
                            strerror(errno));
                        rcmd_exit(errno);
                }
                (void) free(rsrc);

                if (errno = nvlist_add_int64(info->info, RCM_CLIENT_ID,
                    dr_req_list->req[i].pid)) {
                        rcm_log_message(RCM_ERROR,
                            gettext("failed (nvlist_add=%s).\n"),
                            strerror(errno));
                        rcmd_exit(errno);
                }

                if (errno = nvlist_add_int32(info->info, RCM_SEQ_NUM,
                    dr_req_list->req[i].seq_num)) {
                        rcm_log_message(RCM_ERROR,
                            gettext("failed (nvlist_add=%s).\n"),
                            strerror(errno));
                        rcmd_exit(errno);
                }

                if (errno = nvlist_add_int32(info->info, RCM_RSRCSTATE,
                    dr_req_list->req[i].state)) {
                        rcm_log_message(RCM_ERROR,
                            gettext("failed (nvlist_add=%s).\n"),
                            strerror(errno));
                        rcmd_exit(errno);
                }

                if (errno = nvlist_add_string(info->info, RCM_CLIENT_INFO,
                    (char *)locked_info)) {
                        rcm_log_message(RCM_ERROR,
                            gettext("failed (nvlist_add=%s).\n"),
                            strerror(errno));
                        rcmd_exit(errno);
                }

                info->next = result;
                result = info;
        }
        (void) mutex_unlock(&rcm_req_lock);

        return (result);
}

/*
 * Eliminate entries whose dr initiator is no longer running
 * and recover daemon state during daemon restart.
 *
 * This routine is called from either during daemon initialization
 * after all modules have registered resources or from the cleanup
 * thread. In either case, it is the only thread running in the
 * daemon.
 */
void
clean_dr_list()
{
        int i;
        struct clean_list {
                struct clean_list *next;
                char *rsrcname;
                pid_t pid;
                int seq_num;
                int state;
                timespec_t interval;
        } *tmp, *list = NULL;
        char *rsrcnames[2];

        rcm_log_message(RCM_TRACE3,
            "clean_dr_list(): look for stale dr initiators\n");

        rsrcnames[1] = NULL;

        /*
         * Make a list of entries to recover. This is necessary because
         * the recovery operation will modify dr_req_list.
         */
        (void) mutex_lock(&rcm_req_lock);
        for (i = 0; i < dr_req_list->n_req_max; i++) {
                /* skip empty entries */
                if (dr_req_list->req[i].state == RCM_STATE_REMOVE)
                        continue;

                if (dr_req_list->req[i].device[0] == '\0')
                        continue;

                /* skip cascade operations */
                if (dr_req_list->req[i].seq_num & SEQ_NUM_MASK)
                        continue;

                /*
                 * In the cleanup case, ignore entries with initiators alive
                 */
                if ((rcmd_get_state() == RCMD_CLEANUP) &&
                    proc_exist(dr_req_list->req[i].pid))
                        continue;

                rcm_log_message(RCM_TRACE1,
                    "found stale entry: %s\n", dr_req_list->req[i].device);

                tmp = s_malloc(sizeof (*tmp));
                tmp->rsrcname = s_strdup(dr_req_list->req[i].device);
                tmp->state = dr_req_list->req[i].state;
                tmp->pid = dr_req_list->req[i].pid;
                tmp->seq_num = dr_req_list->req[i].seq_num;
                tmp->interval = dr_req_list->req[i].interval;
                tmp->next = list;
                list = tmp;
        }
        (void) mutex_unlock(&rcm_req_lock);

        if (list == NULL)
                return;

        /*
         * If everything worked normally, we shouldn't be here.
         * Since we are here, something went wrong, so say something.
         */
        if (rcmd_get_state() == RCMD_INIT) {
                rcm_log_message(RCM_NOTICE, gettext("rcm_daemon died "
                    "unexpectedly, recovering previous daemon state\n"));
        } else {
                rcm_log_message(RCM_INFO, gettext("one or more dr initiator "
                    "died, attempting automatic recovery\n"));
        }

        while (list) {
                tmp = list;
                list = tmp->next;

                switch (tmp->state) {
                case RCM_STATE_OFFLINE_QUERY:
                case RCM_STATE_OFFLINE_QUERY_FAIL:
                        rsrcnames[0] = tmp->rsrcname;
                        if (proc_exist(tmp->pid)) {
                                /* redo */
                                (void) process_resource_offline(rsrcnames,
                                    tmp->pid, RCM_QUERY, tmp->seq_num, NULL);
                        } else {
                                /* undo */
                                (void) notify_resource_online(rsrcnames,
                                    tmp->pid, 0, tmp->seq_num, NULL);
                        }
                        break;

                case RCM_STATE_OFFLINE:
                case RCM_STATE_OFFLINE_FAIL:
                        rsrcnames[0] = tmp->rsrcname;
                        if (proc_exist(tmp->pid)) {
                                /* redo */
                                (void) process_resource_offline(rsrcnames,
                                    tmp->pid, 0, tmp->seq_num, NULL);
                        } else {
                                /* undo */
                                (void) notify_resource_online(rsrcnames,
                                    tmp->pid, 0, tmp->seq_num, NULL);
                        }
                        break;

                case RCM_STATE_SUSPEND_QUERY:
                case RCM_STATE_SUSPEND_QUERY_FAIL:
                        rsrcnames[0] = tmp->rsrcname;
                        if (proc_exist(tmp->pid)) {
                                /* redo */
                                (void) process_resource_suspend(rsrcnames,
                                    tmp->pid, RCM_QUERY, tmp->seq_num,
                                    &tmp->interval, NULL);
                        } else {
                                /* undo */
                                (void) notify_resource_resume(rsrcnames,
                                    tmp->pid, 0, tmp->seq_num, NULL);
                        }
                        break;

                case RCM_STATE_SUSPEND:
                case RCM_STATE_SUSPEND_FAIL:
                        rsrcnames[0] = tmp->rsrcname;
                        if (proc_exist(tmp->pid)) {
                                /* redo */
                                (void) process_resource_suspend(rsrcnames,
                                    tmp->pid, 0, tmp->seq_num, &tmp->interval,
                                    NULL);
                        } else {
                                /* undo */
                                (void) notify_resource_resume(rsrcnames,
                                    tmp->pid, 0, tmp->seq_num, NULL);
                        }
                        break;

                case RCM_STATE_OFFLINING:
                case RCM_STATE_ONLINING:
                        rsrcnames[0] = tmp->rsrcname;
                        (void) notify_resource_online(rsrcnames, tmp->pid, 0,
                            tmp->seq_num, NULL);
                        break;

                case RCM_STATE_SUSPENDING:
                case RCM_STATE_RESUMING:
                        rsrcnames[0] = tmp->rsrcname;
                        (void) notify_resource_resume(rsrcnames, tmp->pid, 0,
                            tmp->seq_num, NULL);
                        break;

                case RCM_STATE_REMOVING:
                        rsrcnames[0] = tmp->rsrcname;
                        (void) notify_resource_remove(rsrcnames, tmp->pid, 0,
                            tmp->seq_num, NULL);
                        break;

                default:
                        rcm_log_message(RCM_WARNING,
                            gettext("%s in unknown state %d\n"),
                            tmp->rsrcname, tmp->state);
                        break;
                }
                free(tmp->rsrcname);
                free(tmp);
        }
}

/*
 * Selected thread blocking based on event type
 */
barrier_t barrier;

/*
 * Change barrier state:
 *      RCMD_INIT - daemon is intializing, only register allowed
 *      RCMD_NORMAL - normal daemon processing
 *      RCMD_CLEANUP - cleanup thread is waiting or running
 */
int
rcmd_get_state()
{
        return (barrier.state);
}

void
rcmd_set_state(int state)
{
        /*
         * The state transition is as follows:
         *      INIT --> NORMAL <---> CLEANUP
         * The implementation favors the cleanup thread
         */

        (void) mutex_lock(&barrier.lock);
        barrier.state = state;

        switch (state) {
        case RCMD_CLEANUP:
                /*
                 * Wait for existing threads to exit
                 */
                barrier.wanted++;
                while (barrier.thr_count != 0)
                        (void) cond_wait(&barrier.cv, &barrier.lock);
                barrier.wanted--;
                barrier.thr_count = -1;
                break;

        case RCMD_INIT:
        case RCMD_NORMAL:
        default:
                if (barrier.thr_count == -1)
                        barrier.thr_count = 0;
                if (barrier.wanted)
                        (void) cond_broadcast(&barrier.cv);
                break;
        }

        (void) mutex_unlock(&barrier.lock);
}

/*
 * Increment daemon thread count
 */
int
rcmd_thr_incr(int cmd)
{
        int seq_num;

        (void) mutex_lock(&barrier.lock);
        /*
         * Set wanted flag
         */
        barrier.wanted++;

        /*
         * Wait till it is safe for daemon to perform the operation
         *
         * NOTE: if a module registers by passing a request to the
         *      client proccess, we may need to allow register
         *      to come through during daemon initialization.
         */
        while (barrier.state != RCMD_NORMAL)
                (void) cond_wait(&barrier.cv, &barrier.lock);

        if ((cmd == CMD_EVENT) ||
            (cmd == CMD_REGISTER) ||
            (cmd == CMD_UNREGISTER)) {
                /*
                 * Event passthru and register ops don't need sequence number
                 */
                seq_num = -1;
        } else {
                /*
                 * Non register operation gets a sequence number
                 */
                seq_num = get_seq_number();
        }
        barrier.wanted--;
        barrier.thr_count++;
        (void) mutex_unlock(&barrier.lock);

        if ((cmd == CMD_OFFLINE) ||
            (cmd == CMD_SUSPEND) ||
            (cmd == CMD_GETINFO)) {
                /*
                 * For these operations, need to ask modules to
                 * register any new resources that came online.
                 *
                 * This is because mount/umount are not instrumented
                 * to register with rcm before using system resources.
                 * Certain registration ops may fail during sync, which
                 * indicates race conditions. This cannot be avoided
                 * without changing mount/umount.
                 */
                rcmd_db_sync();
        }

        return (seq_num);
}

/*
 * Decrement thread count
 */
void
rcmd_thr_decr()
{
        /*
         * Decrement thread count and wake up reload/cleanup thread.
         */
        (void) mutex_lock(&barrier.lock);
        barrier.last_update = time(NULL);
        if (--barrier.thr_count == 0)
                (void) cond_broadcast(&barrier.cv);
        (void) mutex_unlock(&barrier.lock);
}

/*
 * Wakeup all waiting threads as a result of SIGHUP
 */
static int sighup_received = 0;

void
rcmd_thr_signal()
{
        (void) mutex_lock(&barrier.lock);
        sighup_received = 1;
        (void) cond_broadcast(&barrier.cv);
        (void) mutex_unlock(&barrier.lock);
}

void
rcmd_start_timer(int timeout)
{
        timestruc_t abstime;

        if (timeout == 0)
                timeout = RCM_DAEMON_TIMEOUT;   /* default to 5 minutes */
        else
                dr_req_list->idle_timeout = timeout;    /* persist timeout */

        if (timeout > 0) {
                abstime.tv_sec = time(NULL) + timeout;
        }

        (void) mutex_lock(&barrier.lock);
        for (;;) {
                int idletime;
                int is_active;

                if (timeout > 0)
                        (void) cond_timedwait(&barrier.cv, &barrier.lock,
                            &abstime);
                else
                        (void) cond_wait(&barrier.cv, &barrier.lock);

                /*
                 * If sighup received, change timeout to 0 so the daemon is
                 * shut down at the first possible moment
                 */
                if (sighup_received)
                        timeout = 0;

                /*
                 * If timeout is negative, never shutdown the daemon
                 */
                if (timeout < 0)
                        continue;

                /*
                 * Check for ongoing/pending activity
                 */
                is_active = (barrier.thr_count || barrier.wanted ||
                    (dr_req_list->n_req != 0));
                if (is_active) {
                        abstime.tv_sec = time(NULL) + timeout;
                        continue;
                }

                /*
                 * If idletime is less than timeout, continue to wait
                 */
                idletime = time(NULL) - barrier.last_update;
                if (idletime < timeout) {
                        abstime.tv_sec = barrier.last_update + timeout;
                        continue;
                }
                break;
        }

        (void) script_main_fini();

        rcm_log_message(RCM_INFO, gettext("rcm_daemon is shut down.\n"));
}

/*
 * Code related to polling client pid's
 * Not declared as static so that we can find this structure easily
 * in the core file.
 */
struct {
        int             n_pids;
        int             n_max_pids;
        thread_t        poll_tid;       /* poll thread id */
        int             signaled;
        pid_t           *pids;
        int             *refcnt;
        struct pollfd   *fds;
        cond_t          cv;     /* the associated lock is rcm_req_lock */
} polllist;

static int
find_pid_index(pid_t pid)
{
        int i;

        for (i = 0; i < polllist.n_pids; i++) {
                if (polllist.pids[i] == pid) {
                        return (i);
                }
        }
        return (-1);
}

/*
 * Resize buffer for new pids
 */
static int
get_pid_index()
{
        const int n_chunk = 10;

        int n_max;
        int index = polllist.n_pids;

        if (polllist.n_pids < polllist.n_max_pids) {
                polllist.n_pids++;
                return (index);
        }

        if (polllist.n_max_pids == 0) {
                n_max = n_chunk;
                polllist.pids = s_calloc(n_max, sizeof (pid_t));
                polllist.refcnt = s_calloc(n_max, sizeof (int));
                polllist.fds = s_calloc(n_max, sizeof (struct pollfd));
        } else {
                n_max = polllist.n_max_pids + n_chunk;
                polllist.pids = s_realloc(polllist.pids,
                    n_max * sizeof (pid_t));
                polllist.refcnt = s_realloc(polllist.refcnt,
                    n_max * sizeof (int));
                polllist.fds = s_realloc(polllist.fds,
                    n_max * sizeof (struct pollfd));
        }
        polllist.n_max_pids = n_max;
        polllist.n_pids++;
        return (index);
}

/*
 * rcm_req_lock must be held
 */
static void
add_to_polling_list(pid_t pid)
{
        int fd, index;
        char procfile[MAXPATHLEN];

        if (pid == (pid_t)0)
                return;

        rcm_log_message(RCM_TRACE1, "add_to_polling_list(%ld)\n", pid);

        /*
         * Need to stop the poll thread before manipulating the polllist
         * since poll thread may possibly be using polllist.fds[] and
         * polllist.n_pids. As an optimization, first check if the pid
         * is already in the polllist. If it is, there is no need to
         * stop the poll thread. Just increment the pid reference count
         * and return;
         */
        index = find_pid_index(pid);
        if (index != -1) {
                polllist.refcnt[index]++;
                return;
        }

        stop_polling_thread();

        /*
         * In an attempt to stop the poll thread we may have released
         * and reacquired rcm_req_lock. So find the index again.
         */
        index = find_pid_index(pid);
        if (index != -1) {
                polllist.refcnt[index]++;
                goto done;
        }

        /*
         * Open a /proc file
         */
        (void) sprintf(procfile, "/proc/%ld/as", pid);
        if ((fd = open(procfile, O_RDONLY)) == -1) {
                rcm_log_message(RCM_NOTICE, gettext("open(%s): %s\n"),
                    procfile, strerror(errno));
                goto done;
        }

        /*
         * add pid to polllist
         */
        index = get_pid_index();
        polllist.pids[index] = pid;
        polllist.refcnt[index] = 1;
        polllist.fds[index].fd = fd;
        polllist.fds[index].events = 0;
        polllist.fds[index].revents = 0;

        rcm_log_message(RCM_DEBUG, "add pid %ld at index %ld\n", pid, index);

done:
        start_polling_thread();
}

/*
 * rcm_req_lock must be held
 */
static void
remove_from_polling_list(pid_t pid)
{
        int i, index;

        if (pid == (pid_t)0)
                return;

        rcm_log_message(RCM_TRACE1, "remove_from_polling_list(%ld)\n", pid);

        /*
         * Need to stop the poll thread before manipulating the polllist
         * since poll thread may possibly be using polllist.fds[] and
         * polllist.n_pids. As an optimization, first check the pid
         * reference count. If the pid reference count is greater than 1
         * there is no need to stop the polling thread.
         */

        index = find_pid_index(pid);
        if (index == -1) {
                rcm_log_message(RCM_NOTICE,
                    gettext("error removing pid %ld from polling list\n"), pid);
                return;
        }

        /*
         * decrement the pid refcnt
         */
        if (polllist.refcnt[index] > 1) {
                polllist.refcnt[index]--;
                return;
        }

        stop_polling_thread();

        /*
         * In an attempt to stop the poll thread we may have released
         * and reacquired rcm_req_lock. So find the index again.
         */
        index = find_pid_index(pid);
        if (index == -1) {
                rcm_log_message(RCM_NOTICE,
                    gettext("error removing pid %ld from polling list\n"), pid);
                goto done;
        }

        if (--polllist.refcnt[index] > 0)
                goto done;

        /*
         * refcnt down to zero, delete pid from polling list
         */
        (void) close(polllist.fds[index].fd);
        polllist.n_pids--;

        for (i = index; i < polllist.n_pids; i++) {
                polllist.pids[i] = polllist.pids[i + 1];
                polllist.refcnt[i] = polllist.refcnt[i + 1];
                bcopy(&polllist.fds[i + 1], &polllist.fds[i],
                    sizeof (struct pollfd));
        }

        rcm_log_message(RCM_DEBUG, "remove pid %ld at index %d\n", pid, index);

done:
        start_polling_thread();
}

void
init_poll_thread()
{
        polllist.poll_tid = (thread_t)-1;
}

void
cleanup_poll_thread()
{
        (void) mutex_lock(&rcm_req_lock);
        if (polllist.poll_tid == thr_self()) {
                rcm_log_message(RCM_TRACE2,
                    "cleanup_poll_thread: n_pids = %d\n", polllist.n_pids);
                polllist.poll_tid = (thread_t)-1;
                (void) cond_broadcast(&polllist.cv);
        }
        (void) mutex_unlock(&rcm_req_lock);
}

/*ARGSUSED*/
static void *
pollfunc(void *arg)
{
        sigset_t mask;

        rcm_log_message(RCM_TRACE2, "poll thread started. n_pids = %d\n",
            polllist.n_pids);

        /*
         * Unblock SIGUSR1 to allow polling thread to be killed
         */
        (void) sigemptyset(&mask);
        (void) sigaddset(&mask, SIGUSR1);
        (void) thr_sigsetmask(SIG_UNBLOCK, &mask, NULL);

        (void) poll(polllist.fds, polllist.n_pids, (time_t)-1);

        /*
         * block SIGUSR1 to avoid being killed while holding a lock
         */
        (void) sigemptyset(&mask);
        (void) sigaddset(&mask, SIGUSR1);
        (void) thr_sigsetmask(SIG_BLOCK, &mask, NULL);

        rcm_log_message(RCM_TRACE2, "returned from poll()\n");

        cleanup_poll_thread();

        (void) mutex_lock(&barrier.lock);
        need_cleanup = 1;
        (void) cond_broadcast(&barrier.cv);
        (void) mutex_unlock(&barrier.lock);

        return (NULL);
}

/*
 * rcm_req_lock must be held
 */
void
start_polling_thread()
{
        int err;

        if (rcmd_get_state() != RCMD_NORMAL)
                return;

        if (polllist.poll_tid != (thread_t)-1 || polllist.n_pids == 0)
                return;

        if ((err = thr_create(NULL, 0, pollfunc, NULL, THR_DETACHED,
            &polllist.poll_tid)) == 0)
                polllist.signaled = 0;
        else
                rcm_log_message(RCM_ERROR,
                    gettext("failed to create polling thread: %s\n"),
                    strerror(err));
}

/*
 * rcm_req_lock must be held
 */
static void
stop_polling_thread()
{
        int err;

        while (polllist.poll_tid != (thread_t)-1) {
                if (polllist.signaled == 0) {
                        if ((err = thr_kill(polllist.poll_tid, SIGUSR1)) == 0)
                                polllist.signaled = 1;
                        else
                                /*
                                 * thr_kill shouldn't have failed since the
                                 * poll thread id and the signal are valid.
                                 * So log an error. Since when thr_kill
                                 * fails no signal is sent (as per man page),
                                 * the cond_wait below will wait until the
                                 * the poll thread exits by some other means.
                                 * The poll thread, for example, exits on its
                                 * own when any DR initiator process that it
                                 * is currently polling exits.
                                 */
                                rcm_log_message(RCM_ERROR,
                                    gettext(
                                    "fail to kill polling thread %d: %s\n"),
                                    polllist.poll_tid, strerror(err));
                }
                (void) cond_wait(&polllist.cv, &rcm_req_lock);
        }
}