root/usr/src/cmd/cmd-inet/usr.lib/ilbd/ilbd_hc.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2012 Milan Jurik. All rights reserved.
 */

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/list.h>
#include <sys/stropts.h>
#include <sys/siginfo.h>
#include <sys/wait.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <stdio.h>
#include <strings.h>
#include <stddef.h>
#include <unistd.h>
#include <libilb.h>
#include <port.h>
#include <time.h>
#include <signal.h>
#include <assert.h>
#include <errno.h>
#include <spawn.h>
#include <fcntl.h>
#include <limits.h>
#include "libilb_impl.h"
#include "ilbd.h"

/* Global list of HC objects */
list_t ilbd_hc_list;

/* Timer queue for all hc related timers. */
static iu_tq_t *ilbd_hc_timer_q;

/* Indicate whether the timer needs to be updated */
static boolean_t hc_timer_restarted;

static void ilbd_hc_probe_timer(iu_tq_t *, void *);
static ilb_status_t ilbd_hc_restart_timer(ilbd_hc_t *, ilbd_hc_srv_t *);
static boolean_t ilbd_run_probe(ilbd_hc_srv_t *);

#define MAX(a, b)       ((a) > (b) ? (a) : (b))

/*
 * Number of arguments passed to a probe.  argv[0] is the path name of
 * the probe.
 */
#define HC_PROBE_ARGC   8

/*
 * Max number of characters to be read from the output of a probe.  It
 * is long enough to read in a 64 bit integer.
 */
#define HC_MAX_PROBE_OUTPUT     24

/*
 * Initialize the global list of hc objects.  The list holds ilbd_hc_t
 * structures linked through their ihc_link member.
 */
void
i_ilbd_setup_hc_list(void)
{
        list_create(&ilbd_hc_list, sizeof (ilbd_hc_t),
            offsetof(ilbd_hc_t, ihc_link));
}

/*
 * Look up a hc object by name on the global hc list.  Names are compared
 * without regard to case.  Returns the matching object, or NULL when no
 * object on the list carries that name.
 */
ilbd_hc_t *
ilbd_get_hc(const char *name)
{
        ilbd_hc_t *cur = list_head(&ilbd_hc_list);

        /* Walk until we either run off the list or hit a matching name. */
        while (cur != NULL && strcasecmp(cur->ihc_name, name) != 0)
                cur = list_next(&ilbd_hc_list, cur);

        return (cur);
}

/*
 * Generates an audit record for the create-healthcheck and
 * delete-healthcheck subcommands.
 *
 *   audit_hcname  name of the hc object; only read for ILBD_DESTROY_HC
 *   audit_hcinfo  description of the hc object; only read for
 *                 ILBD_CREATE_HC
 *   cmd           ILBD_CREATE_HC or ILBD_DESTROY_HC.  Any other value is
 *                 logged and no audit record is written.
 *   rc            result of the command being audited; anything other
 *                 than ILB_STATUS_OK is recorded as a failure
 *   ucredp        credential of the requester.  When it is NULL and the
 *                 command is ILBD_CREATE_HC, auditing is skipped.
 *
 * A failure of adt_start_session(), adt_set_from_ucred(), adt_alloc_event()
 * or adt_put_event() is logged and terminates the daemon.
 */
static void
ilbd_audit_hc_event(const char *audit_hcname,
    const ilb_hc_info_t *audit_hcinfo, ilbd_cmd_t cmd,
    ilb_status_t rc, ucred_t *ucredp)
{
        adt_session_data_t      *ah;
        adt_event_data_t        *event;
        au_event_t      flag;
        int     audit_error;

        if ((ucredp == NULL) && (cmd == ILBD_CREATE_HC))  {
                /*
                 * we came here from the path where ilbd incorporates
                 * the configuration that is listed in SCF:
                 * i_ilbd_read_config->ilbd_walk_hc_pgs->
                 *   ->ilbd_scf_instance_walk_pg->ilbd_create_hc
                 * We skip auditing in that case
                 */
                logdebug("ilbd_audit_hc_event: skipping auditing");
                return;
        }

        if (adt_start_session(&ah, NULL, 0) != 0) {
                logerr("ilbd_audit_hc_event: adt_start_session failed");
                exit(EXIT_FAILURE);
        }
        if (adt_set_from_ucred(ah, ucredp, ADT_NEW) != 0) {
                (void) adt_end_session(ah);
                logerr("ilbd_audit_hc_event: adt_set_from_ucred failed");
                exit(EXIT_FAILURE);
        }

        /*
         * Pick the audit event id.  An unexpected command must not reach
         * adt_alloc_event() with an indeterminate event id, so bail out
         * after releasing the session.
         */
        switch (cmd) {
        case ILBD_CREATE_HC:
                flag = ADT_ilb_create_healthcheck;
                break;
        case ILBD_DESTROY_HC:
                flag = ADT_ilb_delete_healthcheck;
                break;
        default:
                logerr("ilbd_audit_hc_event: unexpected command %d",
                    (int)cmd);
                (void) adt_end_session(ah);
                return;
        }

        if ((event = adt_alloc_event(ah, flag)) == NULL) {
                logerr("ilbd_audit_hc_event: adt_alloc_event failed");
                exit(EXIT_FAILURE);
        }
        (void) memset((char *)event, 0, sizeof (adt_event_data_t));

        switch (cmd) {
        case ILBD_CREATE_HC:
                event->adt_ilb_create_healthcheck.auth_used =
                    NET_ILB_CONFIG_AUTH;
                event->adt_ilb_create_healthcheck.hc_test =
                    (char *)audit_hcinfo->hci_test;
                event->adt_ilb_create_healthcheck.hc_name =
                    (char *)audit_hcinfo->hci_name;

                /*
                 * If the value 0 is stored, the default values are
                 * set in the kernel. User land does not know about them
                 * So if the user does not specify them, audit record
                 * will show them as 0
                 */
                event->adt_ilb_create_healthcheck.hc_timeout =
                    audit_hcinfo->hci_timeout;
                event->adt_ilb_create_healthcheck.hc_count =
                    audit_hcinfo->hci_count;
                event->adt_ilb_create_healthcheck.hc_interval =
                    audit_hcinfo->hci_interval;
                break;
        case ILBD_DESTROY_HC:
                event->adt_ilb_delete_healthcheck.auth_used =
                    NET_ILB_CONFIG_AUTH;
                event->adt_ilb_delete_healthcheck.hc_name =
                    (char *)audit_hcname;
                break;
        default:
                /* Not reached: other commands returned above. */
                break;
        }

        /* Fill in success/failure */
        if (rc == ILB_STATUS_OK) {
                if (adt_put_event(event, ADT_SUCCESS, ADT_SUCCESS) != 0) {
                        logerr("ilbd_audit_hc_event: adt_put_event failed");
                        exit(EXIT_FAILURE);
                }
        } else {
                audit_error = ilberror2auditerror(rc);
                if (adt_put_event(event, ADT_FAILURE, audit_error) != 0) {
                        logerr("ilbd_audit_hc_event: adt_put_event failed");
                        exit(EXIT_FAILURE);
                }
        }
        adt_free_event(event);
        (void) adt_end_session(ah);
}

/*
 * Given the ilb_hc_info_t passed in (from the libilb), create a hc object
 * in ilbd.  The parameter ev_port is not used, refer to comments of
 * ilbd_create_sg() in ilbd_sg.c
 *
 *   hc_info  description of the hc object to create
 *   ev_port  unused
 *   ps       NULL when the daemon loads its configuration at start up,
 *            non-NULL when the request comes from a client.  Only client
 *            requests are authorization checked and written to SCF.
 *   ucredp   credential handed to the audit routine
 *
 * Returns ILB_STATUS_OK on success.  Otherwise returns the failure of the
 * authorization check or of ilbd_create_pg(), or one of
 * ILB_STATUS_ENOHCINFO (empty name), ILB_STATUS_EEXIST (name already in
 * use), ILB_STATUS_ENOENT (user probe does not exist), ILB_STATUS_EINVAL
 * (user probe is not an absolute path or is not executable) or
 * ILB_STATUS_ENOMEM.  Every outcome is passed to ilbd_audit_hc_event().
 */
/* ARGSUSED */
ilb_status_t
ilbd_create_hc(const ilb_hc_info_t *hc_info, int ev_port,
    const struct passwd *ps, ucred_t *ucredp)
{
        ilbd_hc_t *hc;
        ilb_status_t ret = ILB_STATUS_OK;
        ilb_status_t probe_rc = ILB_STATUS_OK;

        /*
         * ps == NULL is from the daemon when it starts and load configuration
         * ps != NULL is from client.
         */
        if (ps != NULL) {
                ret = ilbd_check_client_config_auth(ps);
                if (ret != ILB_STATUS_OK) {
                        ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
                            ret, ucredp);
                        return (ret);
                }
        }

        if (hc_info->hci_name[0] == '\0') {
                logdebug("ilbd_create_hc: missing healthcheck info");
                ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
                    ILB_STATUS_ENOHCINFO, ucredp);
                return (ILB_STATUS_ENOHCINFO);
        }

        hc = ilbd_get_hc(hc_info->hci_name);
        if (hc != NULL) {
                logdebug("ilbd_create_hc: healthcheck name %s already"
                    " exists", hc_info->hci_name);
                ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
                    ILB_STATUS_EEXIST, ucredp);
                return (ILB_STATUS_EEXIST);
        }

        /*
         * Sanity check on user supplied probe.  The given path name
         * must be a full path name (starts with '/') and is
         * executable.
         */
        if (strcasecmp(hc_info->hci_test, ILB_HC_STR_TCP) != 0 &&
            strcasecmp(hc_info->hci_test, ILB_HC_STR_UDP) != 0 &&
            strcasecmp(hc_info->hci_test, ILB_HC_STR_PING) != 0) {
                /*
                 * errno is only meaningful right after access() has
                 * failed.  A relative path never reaches access(), so it
                 * is classified as invalid without looking at errno.
                 */
                if (hc_info->hci_test[0] != '/') {
                        probe_rc = ILB_STATUS_EINVAL;
                } else if (access(hc_info->hci_test, X_OK) == -1) {
                        probe_rc = (errno == ENOENT) ?
                            ILB_STATUS_ENOENT : ILB_STATUS_EINVAL;
                }

                if (probe_rc == ILB_STATUS_ENOENT) {
                        logdebug("ilbd_create_hc: user script %s doesn't "
                            "exist", hc_info->hci_test);
                        ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
                            ILB_STATUS_ENOENT, ucredp);
                        return (ILB_STATUS_ENOENT);
                } else if (probe_rc != ILB_STATUS_OK) {
                        logdebug("ilbd_create_hc: user script %s is "
                            "invalid", hc_info->hci_test);
                        ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
                            ILB_STATUS_EINVAL, ucredp);
                        return (ILB_STATUS_EINVAL);
                }
        }

        /* Create and add the hc object */
        hc = calloc(1, sizeof (ilbd_hc_t));
        if (hc == NULL) {
                ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
                    ILB_STATUS_ENOMEM, ucredp);
                return (ILB_STATUS_ENOMEM);
        }
        (void) memcpy(&hc->ihc_info, hc_info, sizeof (ilb_hc_info_t));
        if (strcasecmp(hc->ihc_test, ILB_HC_STR_TCP) == 0)
                hc->ihc_test_type = ILBD_HC_TCP;
        else if (strcasecmp(hc->ihc_test, ILB_HC_STR_UDP) == 0)
                hc->ihc_test_type = ILBD_HC_UDP;
        else if (strcasecmp(hc->ihc_test, ILB_HC_STR_PING) == 0)
                hc->ihc_test_type = ILBD_HC_PING;
        else
                hc->ihc_test_type = ILBD_HC_USER;
        list_create(&hc->ihc_rules, sizeof (ilbd_hc_rule_t),
            offsetof(ilbd_hc_rule_t, hcr_link));

        /* Update SCF */
        if (ps != NULL) {
                if ((ret = ilbd_create_pg(ILBD_SCF_HC, (void *)hc)) !=
                    ILB_STATUS_OK) {
                        ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
                            ret, ucredp);
                        list_destroy(&hc->ihc_rules);
                        free(hc);
                        return (ret);
                }
        }

        /* Everything is fine, now add it to the global list. */
        list_insert_tail(&ilbd_hc_list, hc);
        ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC, ret, ucredp);
        return (ret);
}

/*
 * Destroy the hc object with the given name.
 *
 * The request is refused when the client lacks authorization, when no hc
 * object of that name exists (ILB_STATUS_ENOENT), when the object is still
 * associated with a rule (ILB_STATUS_INUSE), or when its property group
 * cannot be destroyed.  Whatever the outcome, it is handed to
 * ilbd_audit_hc_event() before returning.
 */
ilb_status_t
ilbd_destroy_hc(const char *hc_name, const struct passwd *ps,
    ucred_t *ucredp)
{
        ilb_status_t status;
        ilbd_hc_t *victim;

        /*
         * The daemon never calls a destroy function at start up, so ps
         * is not checked against NULL here.
         */
        status = ilbd_check_client_config_auth(ps);
        if (status != ILB_STATUS_OK)
                goto audit;

        victim = ilbd_get_hc(hc_name);
        if (victim == NULL) {
                logdebug("ilbd_destroy_hc: healthcheck %s does not exist",
                    hc_name);
                status = ILB_STATUS_ENOENT;
                goto audit;
        }

        /* A hc object that is still referenced by a rule must stay. */
        if (victim->ihc_rule_cnt > 0) {
                logdebug("ilbd_destroy_hc: healthcheck %s is associated"
                    " with a rule - cannot remove", hc_name);
                status = ILB_STATUS_INUSE;
                goto audit;
        }

        status = ilbd_destroy_pg(ILBD_SCF_HC, hc_name);
        if (status != ILB_STATUS_OK) {
                logdebug("ilbd_destroy_hc: cannot destroy healthcheck %s "
                    "property group", hc_name);
                goto audit;
        }

        list_remove(&ilbd_hc_list, victim);
        list_destroy(&victim->ihc_rules);
        free(victim);

audit:
        ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC, status, ucredp);
        return (status);
}

/*
 * Fill the reply buffer with the description of the named hc object.
 * Used by libilb to get hc info.
 *
 * On success the reply header is set up by ilbd_reply_ok(), an
 * ilb_hc_info_t is written into the ic_data area of the reply, *rbufsz is
 * grown by sizeof (ilb_hc_info_t) and ILB_STATUS_OK is returned.  When no
 * hc object has that name, ILB_STATUS_ENOENT is returned and the buffer is
 * left untouched.
 */
ilb_status_t
ilbd_get_hc_info(const char *hc_name, uint32_t *rbuf, size_t *rbufsz)
{
        ilbd_hc_t       *hc = ilbd_get_hc(hc_name);
        ilb_hc_info_t   *reply;

        if (hc == NULL) {
                logdebug("%s: healthcheck %s does not exist", __func__,
                    hc_name);
                return (ILB_STATUS_ENOENT);
        }

        ilbd_reply_ok(rbuf, rbufsz);
        reply = (ilb_hc_info_t *)&((ilb_comm_t *)rbuf)->ic_data;

        /* Scalar members first, then the two strings. */
        reply->hci_timeout = hc->ihc_timeout;
        reply->hci_count = hc->ihc_count;
        reply->hci_interval = hc->ihc_interval;
        reply->hci_def_ping = hc->ihc_def_ping;
        (void) strlcpy(reply->hci_name, hc->ihc_name, sizeof (hc->ihc_name));
        (void) strlcpy(reply->hci_test, hc->ihc_test, sizeof (hc->ihc_test));

        *rbufsz += sizeof (ilb_hc_info_t);

        return (ILB_STATUS_OK);
}

/*
 * Copy the hc status of the servers of hc_rule into the reply buffer.
 *
 * On entry *rbufsz is the size of rbuf.  On return it is the size of the
 * reply: the header set up by ilbd_reply_ok(), one ilb_hc_rule_srv_t and
 * one ilb_hc_srv_t per server copied.  Servers are copied in list order
 * until the list ends or the remaining space cannot hold another entry;
 * the number copied is stored in rs_num_srvs.
 *
 * NOTE(review): the remaining space is computed by unsigned subtraction,
 * so this appears to rely on the caller's buffer being at least as large
 * as the header plus one ilb_hc_rule_srv_t - confirm against callers.
 */
static void
ilbd_hc_copy_srvs(uint32_t *rbuf, size_t *rbufsz, ilbd_hc_rule_t *hc_rule,
    const char *rulename)
{
        ilb_hc_rule_srv_t       *reply;
        ilbd_hc_srv_t           *src;
        ilb_hc_srv_t            *dst;
        size_t                  avail = *rbufsz;
        int                     cnt = 0;

        /* Set up the reply buffer.  rbufsz will be set to the new size. */
        ilbd_reply_ok(rbuf, rbufsz);

        /* What is left after the fixed part is available for servers. */
        *rbufsz += sizeof (ilb_hc_rule_srv_t);
        avail -= *rbufsz;

        reply = (ilb_hc_rule_srv_t *)&((ilb_comm_t *)rbuf)->ic_data;

        for (src = list_head(&hc_rule->hcr_servers);
            src != NULL && avail >= sizeof (*dst);
            src = list_next(&hc_rule->hcr_servers, src)) {
                dst = &reply->rs_srvs[cnt++];

                (void) strlcpy(dst->hcs_rule_name, rulename, ILB_NAMESZ);
                (void) strlcpy(dst->hcs_ID, src->shc_sg_srv->sgs_srvID,
                    ILB_NAMESZ);
                (void) strlcpy(dst->hcs_hc_name, src->shc_hc->ihc_name,
                    ILB_NAMESZ);
                dst->hcs_IP = src->shc_sg_srv->sgs_addr;
                dst->hcs_fail_cnt = src->shc_fail_cnt;
                dst->hcs_status = src->shc_status;
                dst->hcs_rtt = src->shc_rtt;
                dst->hcs_lasttime = src->shc_lasttime;
                dst->hcs_nexttime = src->shc_nexttime;

                avail -= sizeof (*dst);
        }
        reply->rs_num_srvs = cnt;
        *rbufsz += cnt * sizeof (*dst);
}

/*
 * Report the hc status of the servers of the named rule.
 *
 * Every rule of every hc object is examined; rule names are compared
 * without regard to case.  For the first match the reply buffer is filled
 * by ilbd_hc_copy_srvs() and ILB_STATUS_OK is returned.  When no hc object
 * has such a rule, ILB_STATUS_RULE_NO_HC is returned.
 */
ilb_status_t
ilbd_get_hc_srvs(const char *rulename, uint32_t *rbuf, size_t *rbufsz)
{
        ilbd_hc_t       *hc = list_head(&ilbd_hc_list);
        ilbd_hc_rule_t  *hc_rule;

        while (hc != NULL) {
                hc_rule = list_head(&hc->ihc_rules);
                while (hc_rule != NULL) {
                        if (strcasecmp(hc_rule->hcr_rule->irl_name,
                            rulename) == 0) {
                                ilbd_hc_copy_srvs(rbuf, rbufsz, hc_rule,
                                    rulename);
                                return (ILB_STATUS_OK);
                        }
                        hc_rule = list_next(&hc->ihc_rules, hc_rule);
                }
                hc = list_next(&ilbd_hc_list, hc);
        }
        return (ILB_STATUS_RULE_NO_HC);
}

/*
 * Set up hc timer handling: create the hc timer queue and create the
 * timer whose expiration is delivered to the event port ev_port with
 * ev_obj as the user data.  The id of the new timer is stored in
 * ev_obj->timerid.  Failure to create the queue or the timer is logged
 * and terminates the daemon.
 */
void
ilbd_hc_timer_init(int ev_port, ilbd_timer_event_obj_t *ev_obj)
{
        port_notify_t port_info;
        struct sigevent ev_spec;

        ilbd_hc_timer_q = iu_tq_create();
        if (ilbd_hc_timer_q == NULL) {
                logerr("%s: cannot create hc timer queue", __func__);
                exit(EXIT_FAILURE);
        }
        hc_timer_restarted = B_FALSE;

        ev_obj->ev = ILBD_EVENT_TIMER;
        ev_obj->timerid = -1;

        /* Timer expirations are to be reported through the event port. */
        port_info.portnfy_port = ev_port;
        port_info.portnfy_user = ev_obj;
        ev_spec.sigev_notify = SIGEV_PORT;
        ev_spec.sigev_value.sival_ptr = &port_info;

        if (timer_create(CLOCK_REALTIME, &ev_spec, &ev_obj->timerid) != -1)
                return;

        logerr("%s: cannot create timer", __func__);
        exit(EXIT_FAILURE);
}

/*
 * HC timeout handler.  Expires the timers on the hc timer queue that are
 * due and then flags the timer as needing to be set up again, which
 * ilbd_hc_timer_update() acts upon.
 */
void
ilbd_hc_timeout(void)
{
        (void) iu_expire_timers(ilbd_hc_timer_q);
        hc_timer_restarted = B_TRUE;
}

/*
 * Arm the timer in ev_obj so that it fires at the earliest timeout on the
 * hc timer queue.  Nothing is done unless the timer list has changed since
 * the last update.  Timers that are already due are expired right here
 * before the earliest remaining timeout is looked at.  When no timer is
 * left, the timer is not armed.
 */
void
ilbd_hc_timer_update(ilbd_timer_event_obj_t *ev_obj)
{
        itimerspec_t spec;
        int earliest;

        /* The timer list is unchanged, the current setting still holds. */
        if (!hc_timer_restarted)
                return;

        /*
         * A timeout of 0 means something is due now.  Handle it
         * immediately and look again, until the earliest timer lies in
         * the future or no timer is left.
         */
        while ((earliest = iu_earliest_timer(ilbd_hc_timer_q)) == 0)
                (void) iu_expire_timers(ilbd_hc_timer_q);

        if (earliest != INFTIM) {
                spec.it_value.tv_sec = earliest / MILLISEC + 1;
                spec.it_value.tv_nsec = 0;
                spec.it_interval.tv_sec = 0;
                spec.it_interval.tv_nsec = 0;

                /*
                 * Failing to set the timer is tolerated: another event
                 * is expected to lead to a further timer_settime() call,
                 * so only some timeouts are missed.  In the worst case
                 * no event happens and ilbd gets stuck...
                 */
                if (timer_settime(ev_obj->timerid, 0, &spec, NULL) == -1)
                        logerr("%s: cannot set timer", __func__);
        }
        hc_timer_restarted = B_FALSE;
}

/*
 * Kill the probe process of a server.
 *
 * The fd of the probe (shc_child_fd) is dissociated from the server's
 * event port and closed, the event object shc_ev is freed, and the child
 * shc_child_pid is sent SIGKILL and waited for.  Failures along the way
 * are only logged.  On return shc_ev is NULL and shc_child_pid is 0.
 */
static void
ilbd_hc_kill_probe(ilbd_hc_srv_t *srv)
{
        /*
         * First dissociate the fd from the event port.  It should not
         * fail.
         */
        if (port_dissociate(srv->shc_ev_port, PORT_SOURCE_FD,
            srv->shc_child_fd) != 0) {
                logdebug("%s: port_dissociate: %s", __func__, strerror(errno));
        }
        (void) close(srv->shc_child_fd);
        free(srv->shc_ev);
        /* Clear the pointer so the freed event object cannot be reused. */
        srv->shc_ev = NULL;

        /* Then kill the probe process. */
        if (kill(srv->shc_child_pid, SIGKILL) != 0) {
                logerr("%s: rule %s server %s: %s", __func__,
                    srv->shc_hc_rule->hcr_rule->irl_name,
                    srv->shc_sg_srv->sgs_srvID, strerror(errno));
        }
        /* Reap the child.  Should not fail... */
        if (waitpid(srv->shc_child_pid, NULL, 0) != srv->shc_child_pid) {
                logdebug("%s: waitpid: rule %s server %s", __func__,
                    srv->shc_hc_rule->hcr_rule->irl_name,
                    srv->shc_sg_srv->sgs_srvID);
        }
        /* A pid of 0 is what callers test for "no probe running". */
        srv->shc_child_pid = 0;
}

/*
 * Mark a server as disabled and disable it in the kernel.  This is used
 * when the server is dead or when no timer can be started for it.  Only
 * the transient (in memory) configuration changes; the persistent
 * configuration is not affected.  A failure to disable the server in the
 * kernel is logged.
 */
static void
ilbd_mark_server_disabled(ilbd_hc_srv_t *srv)
{
        ilb_status_t rc;

        srv->shc_status = ILB_HCS_DISABLED;

        rc = ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
            srv->shc_hc_rule->hcr_rule->irl_name, stat_declare_srv_dead);
        if (rc == ILB_STATUS_OK)
                return;

        logerr("%s: cannot disable server in kernel: rule %s "
            "server %s", __func__,
            srv->shc_hc_rule->hcr_rule->irl_name,
            srv->shc_sg_srv->sgs_srvID);
}

/*
 * Account for a failed probe of a server.
 *
 * The failure count is incremented.  While it is below the ihc_count of
 * the hc object, the probe is simply run again.  Once the count is
 * reached, the server is marked unreachable (ping test or default
 * pinging) or dead (any other test), disabled in the kernel, and the
 * periodic timer is restarted so that probing goes on.  When that timer
 * cannot be restarted the server is marked disabled.
 */
static void
ilbd_set_fail_state(ilbd_hc_srv_t *srv)
{
        const char *rule_name;
        const char *srv_id;

        if (++srv->shc_fail_cnt < srv->shc_hc->ihc_count) {
                /* Not enough failures yet, probe again. */
                ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
                return;
        }

        rule_name = srv->shc_hc_rule->hcr_rule->irl_name;
        srv_id = srv->shc_sg_srv->sgs_srvID;

        logdebug("%s: rule %s server %s fails %u", __func__,
            rule_name, srv_id, srv->shc_fail_cnt);

        /* A failed ping means unreachable, any other failure means dead. */
        srv->shc_status = (srv->shc_hc->ihc_test_type == ILBD_HC_PING ||
            srv->shc_state == ilbd_hc_def_pinging) ?
            ILB_HCS_UNREACH : ILB_HCS_DEAD;

        /* Disable the server in kernel. */
        if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr, rule_name,
            stat_declare_srv_dead) != ILB_STATUS_OK) {
                logerr("%s: cannot disable server in kernel: rule %s "
                    "server %s", __func__, rule_name, srv_id);
        }

        /* Keep on probing, the server may come back to life. */
        if (ilbd_hc_restart_timer(srv->shc_hc, srv) == ILB_STATUS_OK)
                return;

        /* Without a timer all that is left is to disable the server... */
        logerr("%s: cannot restart timer: rule %s server %s", __func__,
            rule_name, srv_id);
        srv->shc_status = ILB_HCS_DISABLED;
}

/*
 * Handler of the kill timer.  It runs when a probe process has not
 * returned within the ihc_timeout period: the probe is killed and the
 * probe is accounted for as failed.  arg is the ilbd_hc_srv_t being
 * probed; tq is not used.
 */
/* ARGSUSED */
static void
ilbd_hc_kill_timer(iu_tq_t *tq, void *arg)
{
        ilbd_hc_srv_t *overdue = arg;

        ilbd_hc_kill_probe(overdue);
        ilbd_set_fail_state(overdue);
}

/*
 * Handler of the probe timer.  Sends out the appropriate probe for the
 * server passed in arg; tq is not used.
 *
 * When the probe has been started, a kill timer of ihc_timeout is
 * scheduled for it.  When the probe cannot be started, the periodic timer
 * is restarted instead, as if the timeout had been extended.  If the
 * timer needed in either case cannot be set up, the server is disabled.
 */
/* ARGSUSED */
static void
ilbd_hc_probe_timer(iu_tq_t *tq, void *arg)
{
        ilbd_hc_srv_t *srv = arg;

        if (ilbd_run_probe(srv)) {
                /* The probe is running, guard it with the kill timer. */
                srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q,
                    srv->shc_hc->ihc_timeout, ilbd_hc_kill_timer, srv);
                if (srv->shc_tid == -1) {
                        logerr("%s: cannot start kill timer: rule %s "
                            "server %s, disabling it", __func__,
                            srv->shc_hc_rule->hcr_rule->irl_name,
                            srv->shc_sg_srv->sgs_srvID);
                        ilbd_mark_server_disabled(srv);
                }
                hc_timer_restarted = B_TRUE;
                return;
        }

        /* The probe could not be started, try again after the interval. */
        if (ilbd_hc_restart_timer(srv->shc_hc, srv) == ILB_STATUS_OK)
                return;

        /*
         * No timer either.  All we can do is to disable this server and
         * hope that the sys admin notices and enables it again later.
         */
        logerr("%s: cannot restart timer: rule %s server %s, "
            "disabling it", __func__,
            srv->shc_hc_rule->hcr_rule->irl_name,
            srv->shc_sg_srv->sgs_srvID);
        ilbd_mark_server_disabled(srv);
}

/*
 * Restart the periodic timer for a given server.
 *
 * The timeout is half of the hc interval plus an offset taken from
 * gethrtime() modulo (interval + 1), and is never less than 1.  The
 * server state is set to default pinging when the hc object asks for it
 * and the test is not itself a ping test, otherwise to probing.
 *
 * Returns ILB_STATUS_TIMER when the timer cannot be scheduled; shc_tid is
 * -1 in that case.  Otherwise shc_lasttime is set to the current time,
 * shc_nexttime to the current time plus the timeout, the timer is flagged
 * as needing an update and ILB_STATUS_OK is returned.
 */
static ilb_status_t
ilbd_hc_restart_timer(ilbd_hc_t *hc, ilbd_hc_srv_t *srv)
{
        int timeout;
        time_t now;

        /*
         * Compute the timeout with a single gethrtime() call; going
         * through MAX() would evaluate the call twice.  Don't allow the
         * timeout interval to be less than 1s.
         */
        timeout = (hc->ihc_interval >> 1) + (gethrtime() %
            (hc->ihc_interval + 1));
        if (timeout < 1)
                timeout = 1;

        /*
         * If the probe is actually a ping probe, there is no need to
         * do default pinging.  Just skip the step.
         */
        if (hc->ihc_def_ping && hc->ihc_test_type != ILBD_HC_PING)
                srv->shc_state = ilbd_hc_def_pinging;
        else
                srv->shc_state = ilbd_hc_probing;
        srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q, timeout,
            ilbd_hc_probe_timer, srv);

        if (srv->shc_tid == -1)
                return (ILB_STATUS_TIMER);

        /*
         * Read the clock once so that shc_nexttime - shc_lasttime is
         * exactly the timeout that was scheduled.
         */
        now = time(NULL);
        srv->shc_lasttime = now;
        srv->shc_nexttime = now + timeout;

        hc_timer_restarted = B_TRUE;
        return (ILB_STATUS_OK);
}

/*
 * Helper routine to associate a server with its hc object.
 *
 * A new ilbd_hc_srv_t is allocated for the server and appended to the
 * server list of hc_rule.  If both the rule and the server are enabled,
 * the periodic probe timer is started for it, otherwise its status is set
 * to disabled.  Returns ILB_STATUS_ENOMEM when the allocation fails, the
 * error of ilbd_hc_restart_timer() when the timer cannot be started (the
 * server is not added then), and ILB_STATUS_OK otherwise.
 */
static ilb_status_t
ilbd_hc_srv_add(ilbd_hc_t *hc, ilbd_hc_rule_t *hc_rule,
    const ilb_sg_srv_t *srv, int ev_port)
{
        ilbd_hc_srv_t *hc_srv = calloc(1, sizeof (ilbd_hc_srv_t));
        ilb_status_t rc;

        if (hc_srv == NULL)
                return (ILB_STATUS_ENOMEM);

        hc_srv->shc_hc = hc;
        hc_srv->shc_hc_rule = hc_rule;
        hc_srv->shc_sg_srv = srv;
        hc_srv->shc_ev_port = ev_port;
        hc_srv->shc_tid = -1;
        hc_srv->shc_nexttime = time(NULL);
        hc_srv->shc_lasttime = hc_srv->shc_nexttime;

        if (!(hc_rule->hcr_rule->irl_flags & ILB_FLAGS_RULE_ENABLED) ||
            !ILB_IS_SRV_ENABLED(srv->sgs_flags)) {
                /* Nothing to probe while rule or server is disabled. */
                hc_srv->shc_status = ILB_HCS_DISABLED;
        } else {
                hc_srv->shc_status = ILB_HCS_UNINIT;
                rc = ilbd_hc_restart_timer(hc, hc_srv);
                if (rc != ILB_STATUS_OK) {
                        free(hc_srv);
                        return (rc);
                }
        }

        list_insert_tail(&hc_rule->hcr_servers, hc_srv);
        return (ILB_STATUS_OK);
}

/*
 * Handy macro to cancel a server's timer.
 *
 * If (srv)->shc_tid is not -1, that timer is cancelled on the hc timer
 * queue and shc_tid is reset to -1.  The asserts check that
 * iu_cancel_timer() returned 1 and that the argument it handed back is
 * the server itself.  hc_timer_restarted is set whether or not a timer
 * was cancelled.
 *
 * The expansion is a brace-enclosed block that declares locals named arg
 * and ret, so the expression passed as srv must not refer to variables of
 * those names.
 */
#define HC_CANCEL_TIMER(srv)                                            \
{                                                                       \
        void *arg;                                                      \
        int ret;                                                        \
        if ((srv)->shc_tid != -1) {                                     \
                ret = iu_cancel_timer(ilbd_hc_timer_q, (srv)->shc_tid, &arg); \
                (srv)->shc_tid = -1;                                    \
                assert(ret == 1);                                       \
                assert(arg == (srv));                                   \
        }                                                               \
        hc_timer_restarted = B_TRUE;                                    \
}

/*
 * Helper routine to dissociate a server from its hc object.
 *
 * The entry of hc_rule that refers to srv is taken off the server list,
 * its timer is cancelled, a probe that is still running is killed, and
 * the entry is freed.  Returns ILB_STATUS_ENOENT when hc_rule has no
 * entry for srv, ILB_STATUS_OK otherwise.
 */
static ilb_status_t
ilbd_hc_srv_rem(ilbd_hc_rule_t *hc_rule, const ilb_sg_srv_t *srv)
{
        ilbd_hc_srv_t *hc_srv = list_head(&hc_rule->hcr_servers);

        /* Find the entry that refers to this very server. */
        while (hc_srv != NULL && hc_srv->shc_sg_srv != srv)
                hc_srv = list_next(&hc_rule->hcr_servers, hc_srv);
        if (hc_srv == NULL)
                return (ILB_STATUS_ENOENT);

        list_remove(&hc_rule->hcr_servers, hc_srv);
        HC_CANCEL_TIMER(hc_srv);
        if (hc_srv->shc_child_pid != 0)
                ilbd_hc_kill_probe(hc_srv);
        free(hc_srv);
        return (ILB_STATUS_OK);
}

/*
 * Helper routine to dissociate all servers of a rule from its hc object.
 * Each entry is taken off the head of the server list, its timer is
 * cancelled, a probe that is still running is killed, and the entry is
 * freed.  The server list of hc_rule is empty on return.
 */
static void
ilbd_hc_srv_rem_all(ilbd_hc_rule_t *hc_rule)
{
        ilbd_hc_srv_t *cur = list_remove_head(&hc_rule->hcr_servers);

        while (cur != NULL) {
                HC_CANCEL_TIMER(cur);
                if (cur->shc_child_pid != 0)
                        ilbd_hc_kill_probe(cur);
                free(cur);
                cur = list_remove_head(&hc_rule->hcr_servers);
        }
}

/* Associate a rule with its hc object. */
ilb_status_t
ilbd_hc_associate_rule(const ilbd_rule_t *rule, int ev_port)
{
        ilbd_hc_t       *hc;
        ilbd_hc_rule_t  *hc_rule;
        ilb_status_t    ret;
        ilbd_sg_t       *sg;
        ilbd_srv_t      *ilbd_srv;

        /* The rule is assumed to be initialized appropriately. */
        if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
                logdebug("ilbd_hc_associate_rule: healthcheck %s does not "
                    "exist", rule->irl_hcname);
                return (ILB_STATUS_ENOHCINFO);
        }
        if ((hc->ihc_test_type == ILBD_HC_TCP &&
            rule->irl_proto != IPPROTO_TCP) ||
            (hc->ihc_test_type == ILBD_HC_UDP &&
            rule->irl_proto != IPPROTO_UDP)) {
                return (ILB_STATUS_RULE_HC_MISMATCH);
        }
        if ((hc_rule = calloc(1, sizeof (ilbd_hc_rule_t))) == NULL) {
                logdebug("ilbd_hc_associate_rule: out of memory");
                return (ILB_STATUS_ENOMEM);
        }

        hc_rule->hcr_rule = rule;
        list_create(&hc_rule->hcr_servers, sizeof (ilbd_hc_srv_t),
            offsetof(ilbd_hc_srv_t, shc_srv_link));

        /* Add all the servers. */
        sg = rule->irl_sg;
        for (ilbd_srv = list_head(&sg->isg_srvlist); ilbd_srv != NULL;
            ilbd_srv = list_next(&sg->isg_srvlist, ilbd_srv)) {
                if ((ret = ilbd_hc_srv_add(hc, hc_rule, &ilbd_srv->isv_srv,
                    ev_port)) != ILB_STATUS_OK) {
                        /* Remove all previously added servers */
                        ilbd_hc_srv_rem_all(hc_rule);
                        list_destroy(&hc_rule->hcr_servers);
                        free(hc_rule);
                        return (ret);
                }
        }
        list_insert_tail(&hc->ihc_rules, hc_rule);
        hc->ihc_rule_cnt++;

        return (ILB_STATUS_OK);
}

/* Dissociate a rule from its hc object. */
ilb_status_t
ilbd_hc_dissociate_rule(const ilbd_rule_t *rule)
{
        ilbd_hc_t       *hc;
        ilbd_hc_rule_t  *hc_rule;

        /* The rule is assumed to be initialized appropriately. */
        if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
                logdebug("ilbd_hc_dissociate_rule: healthcheck %s does not "
                    "exist", rule->irl_hcname);
                return (ILB_STATUS_ENOENT);
        }
        for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
            hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
                if (hc_rule->hcr_rule == rule)
                        break;
        }
        if (hc_rule == NULL) {
                logdebug("ilbd_hc_dissociate_rule: rule %s is not associated "
                    "with healtcheck %s", rule->irl_hcname, hc->ihc_name);
                return (ILB_STATUS_ENOENT);
        }
        ilbd_hc_srv_rem_all(hc_rule);
        list_remove(&hc->ihc_rules, hc_rule);
        hc->ihc_rule_cnt--;
        list_destroy(&hc_rule->hcr_servers);
        free(hc_rule);
        return (ILB_STATUS_OK);
}

/*
 * Check whether a rule is associated with the hc object of the given
 * name.  If it is, B_TRUE is returned, the hc object is stored in *hc and
 * the ilbd_hc_rule_t of the association in *hc_rule.  Otherwise B_FALSE
 * is returned and neither *hc nor *hc_rule is written.
 */
static boolean_t
ilbd_hc_check_rule(const char *hc_name, const ilbd_rule_t *rule,
    ilbd_hc_t **hc, ilbd_hc_rule_t **hc_rule)
{
        ilbd_hc_t       *found_hc = ilbd_get_hc(hc_name);
        ilbd_hc_rule_t  *cur;

        if (found_hc == NULL)
                return (B_FALSE);

        /* Search the rule list for the entry of this very rule. */
        cur = list_head(&found_hc->ihc_rules);
        while (cur != NULL && cur->hcr_rule != rule)
                cur = list_next(&found_hc->ihc_rules, cur);
        if (cur == NULL)
                return (B_FALSE);

        *hc = found_hc;
        *hc_rule = cur;
        return (B_TRUE);
}

/* Associate a server with its hc object. */
ilb_status_t
ilbd_hc_add_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
    int ev_port)
{
        ilbd_hc_t       *hc;
        ilbd_hc_rule_t  *hc_rule;

        if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
                return (ILB_STATUS_ENOENT);
        return (ilbd_hc_srv_add(hc, hc_rule, srv, ev_port));
}

/* Dissociate a server from its hc object. */
ilb_status_t
ilbd_hc_del_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
{
        ilbd_hc_t       *hc;
        ilbd_hc_rule_t  *hc_rule;

        if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
                return (ILB_STATUS_ENOENT);
        return (ilbd_hc_srv_rem(hc_rule, srv));
}

/*
 * Helper routine to enable/disable a server's hc probe.
 *
 * The server is identified by pointer: the hc server entry whose
 * shc_sg_srv equals srv is the one acted upon.
 *
 * Enabling a server that is currently ILB_HCS_DISABLED restarts its timer
 * and resets its status, RTT and failure count; enabling a server in any
 * other state does nothing.  Disabling a server that is not yet disabled
 * marks it disabled, cancels its timer and kills a probe that is still
 * running.
 *
 * Returns ILB_STATUS_ENOENT if the rule is not associated with its hc
 * object or the server is not on the rule's list, the error from
 * ilbd_hc_restart_timer() if the timer cannot be restarted, and
 * ILB_STATUS_OK otherwise.
 */
static ilb_status_t
ilbd_hc_toggle_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
    boolean_t enable)
{
        ilbd_hc_t       *hc;
        ilbd_hc_rule_t  *hc_rule;
        ilbd_hc_srv_t   *tmp_srv;
        ilb_status_t    ret;

        if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
                return (ILB_STATUS_ENOENT);

        /* Find the hc entry that tracks the given server. */
        tmp_srv = list_head(&hc_rule->hcr_servers);
        while (tmp_srv != NULL && tmp_srv->shc_sg_srv != srv)
                tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv);
        if (tmp_srv == NULL)
                return (ILB_STATUS_ENOENT);

        if (!enable) {
                if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
                        tmp_srv->shc_status = ILB_HCS_DISABLED;
                        HC_CANCEL_TIMER(tmp_srv);
                        if (tmp_srv->shc_child_pid != 0)
                                ilbd_hc_kill_probe(tmp_srv);
                }
                return (ILB_STATUS_OK);
        }

        /* Only a disabled server needs to be started again. */
        if (tmp_srv->shc_status != ILB_HCS_DISABLED)
                return (ILB_STATUS_OK);

        ret = ilbd_hc_restart_timer(hc, tmp_srv);
        if (ret != ILB_STATUS_OK) {
                logerr("%s: cannot start timers for "
                    "rule %s server %s", __func__,
                    rule->irl_name,
                    tmp_srv->shc_sg_srv->sgs_srvID);
                return (ret);
        }

        /* Start from fresh... */
        tmp_srv->shc_status = ILB_HCS_UNINIT;
        tmp_srv->shc_rtt = 0;
        tmp_srv->shc_fail_cnt = 0;
        return (ILB_STATUS_OK);
}

ilb_status_t
ilbd_hc_enable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
{
        return (ilbd_hc_toggle_server(rule, srv, B_TRUE));
}

ilb_status_t
ilbd_hc_disable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
{
        return (ilbd_hc_toggle_server(rule, srv, B_FALSE));
}

/*
 * Helper routine to enable/disable a rule's hc probe (including all its
 * servers).
 *
 * Enabling restarts the timer of every server that is ILB_HCS_DISABLED in
 * the hc object but enabled in the rule (ILB_IS_SRV_ENABLED), and resets
 * its status, RTT and failure count.  Disabling cancels the timer of every
 * server that is not yet disabled, marks it disabled and kills a probe
 * that is still running.
 *
 * If a timer cannot be restarted while enabling, the servers visited
 * before the failing one are disabled again and the error from
 * ilbd_hc_restart_timer() is returned.
 *
 * Returns ILB_STATUS_ENOENT if the rule is not associated with its hc
 * object, ILB_STATUS_OK on success.
 */
static ilb_status_t
ilbd_hc_toggle_rule(const ilbd_rule_t *rule, boolean_t enable)
{
        ilbd_hc_t       *hc;
        ilbd_hc_rule_t  *hc_rule;
        ilbd_hc_srv_t   *tmp_srv;
        ilb_status_t    ret;

        if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
                return (ILB_STATUS_ENOENT);

        for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
            tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
                if (enable) {
                        /*
                         * If the server is disabled in the rule, do not
                         * restart its timer.
                         */
                        if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
                            ILB_IS_SRV_ENABLED(
                            tmp_srv->shc_sg_srv->sgs_flags)) {
                                ret = ilbd_hc_restart_timer(hc, tmp_srv);
                                if (ret != ILB_STATUS_OK) {
                                        logerr("%s: cannot start timers for "
                                            "rule %s server %s", __func__,
                                            rule->irl_name,
                                            tmp_srv->shc_sg_srv->sgs_srvID);
                                        goto rollback;
                                }
                                /* Start from fresh... */
                                tmp_srv->shc_status = ILB_HCS_UNINIT;
                                tmp_srv->shc_rtt = 0;
                                tmp_srv->shc_fail_cnt = 0;
                        }
                } else {
                        if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
                                HC_CANCEL_TIMER(tmp_srv);
                                tmp_srv->shc_status = ILB_HCS_DISABLED;
                                if (tmp_srv->shc_child_pid != 0)
                                        ilbd_hc_kill_probe(tmp_srv);
                        }
                }
        }
        return (ILB_STATUS_OK);

rollback:
        /*
         * Only the enable path can fail, so undoing the work always means
         * disabling the servers that precede the failing one.  The failing
         * server itself was never started and is skipped.
         */
        for (tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv);
            tmp_srv != NULL;
            tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv)) {
                if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
                        HC_CANCEL_TIMER(tmp_srv);
                        tmp_srv->shc_status = ILB_HCS_DISABLED;
                        if (tmp_srv->shc_child_pid != 0)
                                ilbd_hc_kill_probe(tmp_srv);
                }
        }
        return (ret);
}

ilb_status_t
ilbd_hc_enable_rule(const ilbd_rule_t *rule)
{
        return (ilbd_hc_toggle_rule(rule, B_TRUE));
}

ilb_status_t
ilbd_hc_disable_rule(const ilbd_rule_t *rule)
{
        return (ilbd_hc_toggle_rule(rule, B_FALSE));
}

/*
 * Map a topology value to the string handed to a probe command.  An
 * unknown topology is logged and yields the empty string.
 */
static const char *
topo_2_str(ilb_topo_t topo)
{
        if (topo == ILB_TOPO_DSR)
                return ("DSR");
        if (topo == ILB_TOPO_NAT)
                return ("NAT");
        if (topo == ILB_TOPO_HALF_NAT)
                return ("HALF_NAT");

        /* Should not happen. */
        logerr("%s: unknown topology", __func__);
        return ("");
}

/*
 * Create the argument list to be passed to a hc probe command.
 *
 * The passed in argv is assumed to have HC_PROBE_ARGC elements, all of
 * which must be NULL on entry (the failure path frees every non-NULL
 * slot).  On success the slots hold, in order: the probe name, the VIP,
 * the backend server address, the transport protocol, the topology, the
 * port to probe and the probe timeout, followed by a terminating NULL.
 * Every string is allocated with strdup() and is to be released with
 * destroy_argv().
 *
 * Returns B_TRUE on success.  On failure B_FALSE is returned, nothing is
 * left allocated and every slot of argv is NULL, so the caller may still
 * run destroy_argv() on the array.
 */
static boolean_t
create_argv(ilbd_hc_srv_t *srv, char *argv[])
{
        char buf[INET6_ADDRSTRLEN];
        ilbd_rule_t const *rule;
        ilb_sg_srv_t const *sg_srv;
        struct in_addr v4_addr;
        in_port_t port;
        int i;

        rule = srv->shc_hc_rule->hcr_rule;
        sg_srv = srv->shc_sg_srv;

        /* argv[0] is the probe name; nothing else is allocated yet. */
        if (srv->shc_state == ilbd_hc_def_pinging) {
                if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL)
                        return (B_FALSE);
        } else {
                switch (srv->shc_hc->ihc_test_type) {
                case ILBD_HC_USER:
                        if ((argv[0] = strdup(srv->shc_hc->ihc_test)) == NULL)
                                return (B_FALSE);
                        break;
                case ILBD_HC_TCP:
                case ILBD_HC_UDP:
                        if ((argv[0] = strdup(ILB_PROBE_PROTO)) ==
                            NULL) {
                                return (B_FALSE);
                        }
                        break;
                case ILBD_HC_PING:
                        if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL) {
                                return (B_FALSE);
                        }
                        break;
                default:
                        /* Never hand out an argv without a program name. */
                        logerr("%s: unknown test type", __func__);
                        return (B_FALSE);
                }
        }

        /*
         * argv[1] is the VIP.
         *
         * Right now, the VIP and the backend server addresses should be
         * in the same IP address family.  The code does not rely on that,
         * in case this assumption is changed in future.
         */
        if (IN6_IS_ADDR_V4MAPPED(&rule->irl_vip)) {
                IN6_V4MAPPED_TO_INADDR(&rule->irl_vip, &v4_addr);
                if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
                        goto cleanup;
        } else {
                if (inet_ntop(AF_INET6, &rule->irl_vip, buf,
                    sizeof (buf)) == NULL) {
                        goto cleanup;
                }
        }
        if ((argv[1] = strdup(buf)) == NULL)
                goto cleanup;

        /*
         * argv[2] is the backend server address.
         */
        if (IN6_IS_ADDR_V4MAPPED(&sg_srv->sgs_addr)) {
                IN6_V4MAPPED_TO_INADDR(&sg_srv->sgs_addr, &v4_addr);
                if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
                        goto cleanup;
        } else {
                if (inet_ntop(AF_INET6, &sg_srv->sgs_addr, buf,
                    sizeof (buf)) == NULL) {
                        goto cleanup;
                }
        }
        if ((argv[2] = strdup(buf)) == NULL)
                goto cleanup;

        /*
         * argv[3] is the transport protocol used in the rule.
         */
        switch (rule->irl_proto) {
        case IPPROTO_TCP:
                argv[3] = strdup("TCP");
                break;
        case IPPROTO_UDP:
                argv[3] = strdup("UDP");
                break;
        default:
                logerr("%s: unknown protocol", __func__);
                goto cleanup;
        }
        if (argv[3] == NULL)
                goto cleanup;

        /*
         * argv[4] is the load balance mode, DSR, NAT, HALF-NAT.
         */
        if ((argv[4] = strdup(topo_2_str(rule->irl_topo))) == NULL)
                goto cleanup;

        /*
         * argv[5] is the port range.  Right now, there should only be 1 port.
         */
        switch (rule->irl_hcpflag) {
        case ILB_HCI_PROBE_FIX:
                port = ntohs(rule->irl_hcport);
                break;
        case ILB_HCI_PROBE_ANY: {
                in_port_t min, max;

                /* A server without its own range uses the rule's range. */
                if (ntohs(sg_srv->sgs_minport) == 0) {
                        min = ntohs(rule->irl_minport);
                        max = ntohs(rule->irl_maxport);
                } else {
                        min = ntohs(sg_srv->sgs_minport);
                        max = ntohs(sg_srv->sgs_maxport);
                }
                /* Pick a port in [min, max], varied by the current hrtime. */
                if (max > min)
                        port = min + gethrtime() % (max - min + 1);
                else
                        port = min;
                break;
        }
        default:
                logerr("%s: unknown HC flag", __func__);
                goto cleanup;
        }
        (void) snprintf(buf, sizeof (buf), "%d", port);
        if ((argv[5] = strdup(buf)) == NULL)
                goto cleanup;

        /*
         * argv[6] is the probe timeout.
         */
        (void) snprintf(buf, sizeof (buf), "%d", srv->shc_hc->ihc_timeout);
        if ((argv[6] = strdup(buf)) == NULL)
                goto cleanup;

        argv[7] = NULL;
        return (B_TRUE);

cleanup:
        /*
         * Release whatever was built so far and clear the slots, so that a
         * caller running destroy_argv() on the same array afterwards does
         * not free the strings a second time.
         */
        for (i = 0; i < HC_PROBE_ARGC; i++) {
                free(argv[i]);
                argv[i] = NULL;
        }
        return (B_FALSE);
}

/*
 * Free every string of a NULL-terminated argument vector.  The array
 * itself is not freed and its slots are left as they are.
 */
static void
destroy_argv(char *argv[])
{
        char **slot = argv;

        while (*slot != NULL) {
                free(*slot);
                slot++;
        }
}

/*
 * Spawn a process to run the hc probe on the given server.
 *
 * A pipe is created whose write end becomes the child's stdout and whose
 * read end is made non-blocking and associated with the server's event
 * port (srv->shc_ev_port), with a freshly allocated probe event as the
 * user data.  On success the child's pid, the read end of the pipe and the
 * probe event are recorded in srv.
 *
 * Returns B_TRUE when the probe was spawned and its pipe was associated
 * with the event port, B_FALSE otherwise.
 */
static boolean_t
ilbd_run_probe(ilbd_hc_srv_t *srv)
{
        posix_spawn_file_actions_t      fd_actions;
        boolean_t                       init_fd_actions = B_FALSE;
        posix_spawnattr_t               attr;
        boolean_t                       init_attr = B_FALSE;
        sigset_t                        child_sigset;
        int                             fds[2];
        int                             fdflags;
        int                             spawn_err;
        pid_t                           pid;
        char                            *child_argv[HC_PROBE_ARGC];
        ilbd_hc_probe_event_t           *probe_ev;
        const char                      *probe_name;

        bzero(child_argv, HC_PROBE_ARGC * sizeof (char *));
        if ((probe_ev = calloc(1, sizeof (*probe_ev))) == NULL) {
                logdebug("ilbd_run_probe: calloc");
                return (B_FALSE);
        }

        /* Set up a pipe to get output from probe command. */
        if (pipe(fds) < 0) {
                logdebug("ilbd_run_probe: cannot create pipe");
                free(probe_ev);
                return (B_FALSE);
        }
        /* Set our side of the pipe to be non-blocking */
        if ((fdflags = fcntl(fds[0], F_GETFL, 0)) == -1) {
                logdebug("ilbd_run_probe: fcntl(F_GETFL)");
                goto cleanup;
        }
        if (fcntl(fds[0], F_SETFL, fdflags | O_NONBLOCK) == -1) {
                logdebug("ilbd_run_probe: fcntl(F_SETFL)");
                goto cleanup;
        }

        if (posix_spawn_file_actions_init(&fd_actions) != 0) {
                logdebug("ilbd_run_probe: posix_spawn_file_actions_init");
                goto cleanup;
        }
        init_fd_actions = B_TRUE;
        if (posix_spawnattr_init(&attr) != 0) {
                logdebug("ilbd_run_probe: posix_spawnattr_init");
                goto cleanup;
        }
        init_attr = B_TRUE;
        /* In the child: drop the read end, make the write end stdout. */
        if (posix_spawn_file_actions_addclose(&fd_actions, fds[0]) != 0) {
                logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
                goto cleanup;
        }
        if (posix_spawn_file_actions_adddup2(&fd_actions, fds[1],
            STDOUT_FILENO) != 0) {
                logdebug("ilbd_run_probe: posix_spawn_file_actions_dup2");
                goto cleanup;
        }
        if (posix_spawn_file_actions_addclose(&fd_actions, fds[1]) != 0) {
                logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
                goto cleanup;
        }

        /* Reset all signal handling of the child to default. */
        (void) sigfillset(&child_sigset);
        if (posix_spawnattr_setsigdefault(&attr, &child_sigset) != 0) {
                logdebug("ilbd_run_probe: posix_spawnattr_setsigdefault");
                goto cleanup;
        }
        /* Don't want SIGCHLD. */
        if (posix_spawnattr_setflags(&attr, POSIX_SPAWN_NOSIGCHLD_NP|
            POSIX_SPAWN_SETSIGDEF) != 0) {
                logdebug("ilbd_run_probe: posix_spawnattr_setflags");
                goto cleanup;
        }

        if (!create_argv(srv, child_argv)) {
                logdebug("ilbd_run_probe: create_argv");
                /*
                 * create_argv() has already released everything it
                 * allocated.  Forget the stale pointers so that the
                 * destroy_argv() call below cannot free them again.
                 */
                bzero(child_argv, HC_PROBE_ARGC * sizeof (char *));
                goto cleanup;
        }

        /*
         * If we are doing default pinging or not using a user supplied
         * probe, we should execute our standard supplied probe.  The
         * supplied probe command handles all types of probes.  And the
         * type used depends on argv[0], as filled in by create_argv().
         */
        if (srv->shc_state == ilbd_hc_def_pinging ||
            srv->shc_hc->ihc_test_type != ILBD_HC_USER) {
                probe_name = ILB_PROBE_PROTO;
        } else {
                probe_name = srv->shc_hc->ihc_test;
        }
        /* posix_spawn() reports failure through its return value. */
        spawn_err = posix_spawn(&pid, probe_name, &fd_actions, &attr,
            child_argv, NULL);
        if (spawn_err != 0) {
                logerr("%s: posix_spawn: %s for server %s: %s", __func__,
                    srv->shc_hc->ihc_test, srv->shc_sg_srv->sgs_srvID,
                    strerror(spawn_err));
                goto cleanup;
        }

        /* The write end now belongs to the child only. */
        (void) close(fds[1]);
        fds[1] = -1;
        srv->shc_child_pid = pid;
        srv->shc_child_fd = fds[0];
        srv->shc_ev = probe_ev;

        probe_ev->ihp_ev = ILBD_EVENT_PROBE;
        probe_ev->ihp_srv = srv;
        probe_ev->ihp_pid = pid;
        if (port_associate(srv->shc_ev_port, PORT_SOURCE_FD, fds[0],
            POLLRDNORM, probe_ev) != 0) {
                /*
                 * Need to kill the child.  It will free the srv->shc_ev,
                 * which is probe_ev.  So set probe_ev to NULL.
                 *
                 * NOTE(review): if ilbd_hc_kill_probe() also closes
                 * srv->shc_child_fd, the close of fds[0] in the cleanup
                 * code below is redundant -- confirm.
                 */
                ilbd_hc_kill_probe(srv);
                probe_ev = NULL;
                goto cleanup;
        }

        destroy_argv(child_argv);
        (void) posix_spawn_file_actions_destroy(&fd_actions);
        (void) posix_spawnattr_destroy(&attr);
        return (B_TRUE);

cleanup:
        destroy_argv(child_argv);
        if (init_fd_actions == B_TRUE)
                (void) posix_spawn_file_actions_destroy(&fd_actions);
        if (init_attr == B_TRUE)
                (void) posix_spawnattr_destroy(&attr);
        (void) close(fds[0]);
        /* The write end may already have been closed after the spawn. */
        if (fds[1] != -1)
                (void) close(fds[1]);
        free(probe_ev);
        return (B_FALSE);
}

/*
 * Called by ilbd_hc_probe_return() and ilbd_hc_child_hup() to re-associate
 * the fd to a child with the event port.
 *
 * If the association fails, the child is killed and waited for, and both
 * the fd and the probe event are released, since no further event can be
 * delivered for them.
 */
static void
reassociate_port(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
{
        if (port_associate(ev_port, PORT_SOURCE_FD, fd,
            POLLRDNORM, ev) != 0) {
                /*
                 * If we cannot reassociate with the port, the only
                 * thing we can do now is to kill the child and
                 * do a blocking wait here...
                 */
                logdebug("%s: port_associate: %s", __func__, strerror(errno));
                if (kill(ev->ihp_pid, SIGKILL) != 0)
                        logerr("%s: kill: %s", __func__, strerror(errno));
                if (waitpid(ev->ihp_pid, NULL, 0) != ev->ihp_pid)
                        logdebug("%s: waitpid: %s", __func__, strerror(errno));
                /*
                 * Close the pipe as ilbd_hc_child_hup() does once the
                 * child is reaped; otherwise the fd is leaked.
                 */
                (void) close(fd);
                free(ev);
        }
}

/*
 * To handle a child probe process hanging up.
 *
 * If the probe has not been accounted for yet, it is detached from its
 * server, the server's timer is cancelled and the server is put into the
 * fail state.  The child is then reaped without blocking: when it has
 * exited, the fd is closed and the event freed; otherwise the fd is
 * associated with the event port again so that the reaping is retried.
 */
static void
ilbd_hc_child_hup(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
{
        ilbd_hc_srv_t *srv = ev->ihp_srv;
        pid_t ret_pid;
        int ret;

        if (!ev->ihp_done) {
                /* ilbd does not care about this process anymore ... */
                ev->ihp_done = B_TRUE;
                srv->shc_ev = NULL;
                srv->shc_child_pid = 0;
                HC_CANCEL_TIMER(srv);
                ilbd_set_fail_state(srv);
        }

        ret_pid = waitpid(ev->ihp_pid, &ret, WNOHANG);
        if (ret_pid == -1 || ret_pid == 0) {
                if (ret_pid == -1)
                        logperror("ilbd_hc_child_hup: waitpid");
                /* The child has not completed the exit. Wait again. */
                reassociate_port(ev_port, fd, ev);
                return;
        }

        /* Right now, we just ignore the exit status. */
        if (WIFEXITED(ret))
                ret = WEXITSTATUS(ret);
        (void) close(fd);
        free(ev);
}

/*
 * To read the output of a child probe process.
 *
 * The output is read from fd and, for the first read of a given probe
 * only, parsed as the RTT.  An RTT of -1 puts the server into the fail
 * state.  Any other value counts as a successful probe; a value in
 * (0, UINT_MAX] is additionally folded into the server's smoothed RTT.
 * What happens next depends on the server's state: after the default ping
 * the real probe is started, after a real probe the server is marked
 * alive (re-enabling it in the kernel first if it was unreachable or dead)
 * and its timer is restarted.
 */
static void
ilbd_hc_child_data(int fd, ilbd_hc_probe_event_t *ev)
{
        ilbd_hc_srv_t *srv = ev->ihp_srv;
        char buf[HC_MAX_PROBE_OUTPUT];
        int ret;
        int64_t rtt;

        bzero(buf, sizeof (buf));
        ret = read(fd, buf, sizeof (buf) - 1);
        /* Should not happen since event port should have caught this. */
        assert(ret > 0);

        /*
         * We expect the probe command to print out the RTT only.  But
         * the command may misbehave and print out more than what we intend
         * to read in.  Once the probe has been accounted for, further
         * reads only serve to "flush" out the rest of its output.
         */
        if (ev->ihp_done)
                return;

        ev->ihp_done = B_TRUE;
        /* We don't need to know about this event anymore. */
        srv->shc_ev = NULL;
        srv->shc_child_pid = 0;
        HC_CANCEL_TIMER(srv);

        rtt = strtoll(buf, NULL, 10);

        /*
         * -1 means the server is dead or the probe somehow fails.  Treat
         * them both as server is dead.
         */
        if (rtt == -1) {
                ilbd_set_fail_state(srv);
                return;
        }

        /* If the returned RTT value is not valid, just ignore it. */
        if (rtt > 0 && rtt <= UINT_MAX) {
                /* Set rtt to be the simple smoothed average. */
                if (srv->shc_rtt == 0) {
                        srv->shc_rtt = rtt;
                } else {
                        srv->shc_rtt = 3 * ((srv)->shc_rtt >> 2) +
                            (rtt >> 2);
                }
        }

        switch (srv->shc_state) {
        case ilbd_hc_def_pinging:
                srv->shc_state = ilbd_hc_probing;

                /* Ping is OK, now start the probe. */
                ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
                break;
        case ilbd_hc_probing: {
                boolean_t was_down;

                srv->shc_fail_cnt = 0;
                was_down = (srv->shc_status == ILB_HCS_UNREACH ||
                    srv->shc_status == ILB_HCS_DEAD) ? B_TRUE : B_FALSE;

                /*
                 * A server that was dead before has to be re-enabled in
                 * the kernel first.  If that fails now, hopefully when the
                 * timer fires again later, the enabling can be done.
                 */
                if (!was_down ||
                    ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
                    srv->shc_hc_rule->hcr_rule->irl_name,
                    stat_declare_srv_alive) == ILB_STATUS_OK) {
                        srv->shc_status = ILB_HCS_ALIVE;
                } else {
                        logerr("%s: cannot enable server in kernel: "
                            " rule %s server %s", __func__,
                            srv->shc_hc_rule->hcr_rule->irl_name,
                            srv->shc_sg_srv->sgs_srvID);
                }

                if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
                        logerr("%s: cannot restart timer: rule %s server %s",
                            __func__, srv->shc_hc_rule->hcr_rule->irl_name,
                            srv->shc_sg_srv->sgs_srvID);
                        ilbd_mark_server_disabled(srv);
                }
                break;
        }
        default:
                logdebug("%s: unknown state", __func__);
                break;
        }
}

/*
 * Handle the return event of a child probe fd.
 *
 * Readable data is consumed first.  A hang-up or error is then handed to
 * ilbd_hc_child_hup(); without one, the fd is associated with the event
 * port again.
 */
void
ilbd_hc_probe_return(int ev_port, int fd, int port_events,
    ilbd_hc_probe_event_t *ev)
{
        const int hup_mask = POLLHUP | POLLERR;

        /*
         * Note that there can be more than one events delivered to us at
         * the same time.  So we need to check them individually.
         */
        if ((port_events & POLLRDNORM) != 0)
                ilbd_hc_child_data(fd, ev);

        if ((port_events & hup_mask) == 0) {
                /*
                 * Re-associate the fd with the port so that when the child
                 * exits, we can reap the status.
                 */
                reassociate_port(ev_port, fd, ev);
                return;
        }

        ilbd_hc_child_hup(ev_port, fd, ev);
}