root/usr/src/uts/common/inet/ilb/ilb.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/ksynch.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <netinet/in.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/udp_impl.h>
#include <inet/kstatcom.h>

#include <inet/ilb_ip.h>
#include "ilb_alg.h"
#include "ilb_nat.h"
#include "ilb_conn.h"

/* ILB kmem cache flag */
int ilb_kmem_flags = 0;

/*
 * The default size for the different hash tables.  Global for all stacks.
 * But each stack has its own table, just that their sizes are the same.
 */
static size_t ilb_rule_hash_size = 2048;

static size_t ilb_conn_hash_size = 262144;

static size_t ilb_sticky_hash_size = 262144;

/* This should be a prime number. */
static size_t ilb_nat_src_hash_size = 97;

/* Default NAT cache entry expiry time. */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;

/* Default sticky entry expiry time. */
static uint32_t ilb_sticky_expiry = 60;

/* addr is assumed to be a uint8_t * to an ipaddr_t. */
#define ILB_RULE_HASH(addr, hash_size) \
        ((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
        *(addr)) & ((hash_size) - 1))

/*
 * Note on ILB delayed processing
 *
 * To avoid in line removal on some of the data structures, such as rules,
 * servers and ilb_conn_hash entries, ILB delays such processing to a taskq.
 * There are three types of ILB taskq:
 *
 * 1. rule handling: created at stack initialialization time, ilb_stack_init()
 * 2. conn hash handling: created at conn hash initialization time,
 *                        ilb_conn_hash_init()
 * 3. sticky hash handling: created at sticky hash initialization time,
 *                          ilb_sticky_hash_init()
 *
 * The rule taskq is for processing rule and server removal.  When a user
 * land rule/server removal request comes in, a taskq is dispatched after
 * removing the rule/server from all related hashes.  This taskq will wait
 * until all references to the rule/server are gone before removing it.
 * So the user land thread requesting the removal does not need to wait
 * for the removal completion.
 *
 * The conn hash/sticky hash taskq is for processing ilb_conn_hash and
 * ilb_sticky_hash table entry removal.  There are ilb_conn_timer_size timers
 * and ilb_sticky_timer_size timers running for ilb_conn_hash and
 * ilb_sticky_hash cleanup respectively.   Each timer is responsible for one
 * portion (same size) of the hash table.  When a timer fires, it dispatches
 * a conn hash taskq to clean up its portion of the table.  This avoids in
 * line processing of the removal.
 *
 * There is another delayed processing, the clean up of NAT source address
 * table.  We just use the timer to directly handle it instead of using
 * a taskq.  The reason is that the table is small so it is OK to use the
 * timer.
 */

/* ILB rule taskq constants. */
#define ILB_RULE_TASKQ_NUM_THR  20

/* Argument passed to ILB rule taskq routines. */
typedef struct {
        ilb_stack_t     *ilbs;
        ilb_rule_t      *rule;
} ilb_rule_tq_t;

/* kstat handling routines. */
static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
    ilb_server_t *);

/* Rule hash handling routines. */
static void ilb_rule_hash_init(ilb_stack_t *);
static void ilb_rule_hash_fini(ilb_stack_t *);
static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
static void ilb_rule_hash_del(ilb_rule_t *);
static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
    in_port_t, zoneid_t, uint32_t, boolean_t *);

static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
static void ilb_del_rule_common(ilb_stack_t *, ilb_rule_t *);
static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
    int *);
static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
    int, in_port_t, in_port_t, const in6_addr_t *);

/* Back end server handling routines. */
static void ilb_server_free(ilb_server_t *);

/* Network stack handling routines. */
static void *ilb_stack_init(netstackid_t, netstack_t *);
static void ilb_stack_shutdown(netstackid_t, void *);
static void ilb_stack_fini(netstackid_t, void *);

/* Sticky connection handling routines. */
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);

/* Handy macro to check for unspecified address. */
#define IS_ADDR_UNSPEC(addr)                                            \
        (IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) :  \
            IN6_IS_ADDR_UNSPECIFIED(addr))

/*
 * Global kstat instance counter.  When a rule is created, its kstat instance
 * number is assigned by ilb_kstat_instance and ilb_kstat_instance is
 * incremented.
 */
static uint_t ilb_kstat_instance = 0;

/*
 * The ILB global kstat has name ILB_G_KS_NAME and class name ILB_G_KS_CNAME.
 * A rule's kstat has ILB_RULE_KS_CNAME class name.
 */
#define ILB_G_KS_NAME           "global"
#define ILB_G_KS_CNAME          "kstat"
#define ILB_RULE_KS_CNAME       "rulestat"

static kstat_t *
ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
{
        kstat_t *ksp;
        ilb_g_kstat_t template = {
                { "num_rules",          KSTAT_DATA_UINT64, 0 },
                { "ip_frag_in",         KSTAT_DATA_UINT64, 0 },
                { "ip_frag_dropped",    KSTAT_DATA_UINT64, 0 }
        };

        ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
            ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
            KSTAT_FLAG_VIRTUAL, stackid);
        if (ksp == NULL)
                return (NULL);
        bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
        ksp->ks_data = ilbs->ilbs_kstat;
        ksp->ks_private = (void *)(uintptr_t)stackid;

        kstat_install(ksp);
        return (ksp);
}

static void
ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
{
        if (ilbs->ilbs_ksp != NULL) {
                ASSERT(stackid == (netstackid_t)(uintptr_t)
                    ilbs->ilbs_ksp->ks_private);
                kstat_delete_netstack(ilbs->ilbs_ksp, stackid);
                ilbs->ilbs_ksp = NULL;
        }
}

static kstat_t *
ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
{
        kstat_t *ksp;
        ilb_rule_kstat_t template = {
                { "num_servers",                KSTAT_DATA_UINT64, 0 },
                { "bytes_not_processed",        KSTAT_DATA_UINT64, 0 },
                { "pkt_not_processed",          KSTAT_DATA_UINT64, 0 },
                { "bytes_dropped",              KSTAT_DATA_UINT64, 0 },
                { "pkt_dropped",                KSTAT_DATA_UINT64, 0 },
                { "nomem_bytes_dropped",        KSTAT_DATA_UINT64, 0 },
                { "nomem_pkt_dropped",          KSTAT_DATA_UINT64, 0 },
                { "noport_bytes_dropped",       KSTAT_DATA_UINT64, 0 },
                { "noport_pkt_dropped",         KSTAT_DATA_UINT64, 0 },
                { "icmp_echo_processed",        KSTAT_DATA_UINT64, 0 },
                { "icmp_dropped",               KSTAT_DATA_UINT64, 0 },
                { "icmp_too_big_processed",     KSTAT_DATA_UINT64, 0 },
                { "icmp_too_big_dropped",       KSTAT_DATA_UINT64, 0 }
        };

        ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
            rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
            NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
        if (ksp == NULL)
                return (NULL);

        bcopy(&template, &rule->ir_kstat, sizeof (template));
        ksp->ks_data = &rule->ir_kstat;
        ksp->ks_private = (void *)(uintptr_t)stackid;

        kstat_install(ksp);
        return (ksp);
}

static kstat_t *
ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
    ilb_server_t *server)
{
        kstat_t *ksp;
        ilb_server_kstat_t template = {
                { "bytes_processed",    KSTAT_DATA_UINT64, 0 },
                { "pkt_processed",      KSTAT_DATA_UINT64, 0 },
                { "ip_address",         KSTAT_DATA_STRING, 0 }
        };
        char cname_buf[KSTAT_STRLEN];

        /* 7 is "-sstat" */
        ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
        (void) sprintf(cname_buf, "%s-sstat", rule->ir_name);
        ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
            server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
            NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
        if (ksp == NULL)
                return (NULL);

        bcopy(&template, &server->iser_kstat, sizeof (template));
        ksp->ks_data = &server->iser_kstat;
        ksp->ks_private = (void *)(uintptr_t)stackid;

        kstat_named_setstr(&server->iser_kstat.ip_address,
            server->iser_ip_addr);
        /* We never change the IP address */
        ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;

        kstat_install(ksp);
        return (ksp);
}

/* Initialize the rule hash table. */
static void
ilb_rule_hash_init(ilb_stack_t *ilbs)
{
        int i;

        /*
         * If ilbs->ilbs_rule_hash_size is not a power of 2, bump it up to
         * the next power of 2.
         */
        if (!ISP2(ilbs->ilbs_rule_hash_size)) {
                for (i = 0; i < 31; i++) {
                        if (ilbs->ilbs_rule_hash_size < (1 << i))
                                break;
                }
                ilbs->ilbs_rule_hash_size = 1 << i;
        }
        ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
            ilbs->ilbs_rule_hash_size, KM_SLEEP);
        for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
                mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
                    MUTEX_DEFAULT, NULL);
        }
}

/* Clean up the rule hash table. */
static void
ilb_rule_hash_fini(ilb_stack_t *ilbs)
{
        if (ilbs->ilbs_g_hash == NULL)
                return;
        kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
            ilbs->ilbs_rule_hash_size);
}

/* Add a rule to the rule hash table. */
static void
ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
{
        int i;

        i = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
            ilbs->ilbs_rule_hash_size);
        DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, i);
        mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        rule->ir_hash_next = ilbs->ilbs_g_hash[i].ilb_hash_rule;
        if (ilbs->ilbs_g_hash[i].ilb_hash_rule != NULL)
                ilbs->ilbs_g_hash[i].ilb_hash_rule->ir_hash_prev = rule;
        rule->ir_hash_prev = NULL;
        ilbs->ilbs_g_hash[i].ilb_hash_rule = rule;

        rule->ir_hash = &ilbs->ilbs_g_hash[i];
        mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
}

/*
 * Remove a rule from the rule hash table.  Note that the rule is not freed
 * in this routine.
 */
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
        mutex_enter(&rule->ir_hash->ilb_hash_lock);
        if (rule->ir_hash->ilb_hash_rule == rule) {
                rule->ir_hash->ilb_hash_rule = rule->ir_hash_next;
                if (rule->ir_hash_next != NULL)
                        rule->ir_hash_next->ir_hash_prev = NULL;
        } else {
                if (rule->ir_hash_prev != NULL)
                        rule->ir_hash_prev->ir_hash_next =
                            rule->ir_hash_next;
                if (rule->ir_hash_next != NULL) {
                        rule->ir_hash_next->ir_hash_prev =
                            rule->ir_hash_prev;
                }
        }
        mutex_exit(&rule->ir_hash->ilb_hash_lock);

        rule->ir_hash_next = NULL;
        rule->ir_hash_prev = NULL;
        rule->ir_hash = NULL;
}

/*
 * Given the info of a packet, look for a match in the rule hash table.
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
        int i;
        ilb_rule_t *rule;
        ipaddr_t v4_addr;

        *busy = B_FALSE;
        IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
        i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
        port = ntohs(port);

        mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
            rule = rule->ir_hash_next) {
                if (!rule->ir_port_range) {
                        if (rule->ir_min_port != port)
                                continue;
                } else {
                        if (port < rule->ir_min_port ||
                            port > rule->ir_max_port) {
                                continue;
                        }
                }
                if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
                    rule->ir_zoneid != zoneid) {
                        continue;
                }

                if (l3 == IPPROTO_IP) {
                        if (rule->ir_target_v4 != INADDR_ANY &&
                            rule->ir_target_v4 != v4_addr) {
                                continue;
                        }
                } else {
                        if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
                            !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
                                continue;
                        }
                }

                /*
                 * Just update the stats if the rule is disabled.
                 */
                mutex_enter(&rule->ir_lock);
                if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
                        ILB_R_KSTAT(rule, pkt_not_processed);
                        ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
                        mutex_exit(&rule->ir_lock);
                        rule = NULL;
                        break;
                } else if (rule->ir_flags & ILB_RULE_BUSY) {
                        /*
                         * If we are busy...
                         *
                         * XXX we should have a queue to postpone the
                         * packet processing.  But this requires a
                         * mechanism in IP to re-start the packet
                         * processing.  So for now, just drop the packet.
                         */
                        ILB_R_KSTAT(rule, pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
                        mutex_exit(&rule->ir_lock);
                        *busy = B_TRUE;
                        rule = NULL;
                        break;
                } else {
                        rule->ir_refcnt++;
                        ASSERT(rule->ir_refcnt != 1);
                        mutex_exit(&rule->ir_lock);
                        break;
                }
        }
        mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        return (rule);
}

/*
 * Add a rule to the global rule list.  This list is for finding all rules
 * in an IP stack.  The caller is assumed to hold the ilbs_g_lock.
 */
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
        ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
        rule->ir_next = ilbs->ilbs_rule_head;
        ilbs->ilbs_rule_head = rule;
        ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}

/* The call is assumed to hold the ilbs_g_lock. */
static void
ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
        ilb_rule_t *tmp_rule;
        ilb_rule_t *prev_rule;

        ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
        prev_rule = NULL;
        for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
            prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
                if (tmp_rule == rule)
                        break;
        }
        if (tmp_rule == NULL) {
                mutex_exit(&ilbs->ilbs_g_lock);
                return;
        }
        if (prev_rule == NULL)
                ilbs->ilbs_rule_head = tmp_rule->ir_next;
        else
                prev_rule->ir_next = tmp_rule->ir_next;
        ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
}

/*
 * Helper routine to calculate how many source addresses are in a given
 * range.
 */
static int64_t
num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
{
        int64_t ret;
        uint32_t addr1, addr2;

        /*
         * Here we assume that the max number of NAT source cannot be
         * large such that the most significant 2 s6_addr32 must be
         * equal.
         */
        addr1 = ntohl(a1->s6_addr32[3]);
        addr2 = ntohl(a2->s6_addr32[3]);
        if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
            a1->s6_addr32[1] != a2->s6_addr32[1] ||
            a1->s6_addr32[2] > a2->s6_addr32[2] ||
            (a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
                return (-1);
        }
        if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
                return (addr2 - addr1 + 1);
        } else {
                ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
                ret <<= 32;
                ret = ret + addr1 - addr2;
                return (ret + 1);
        }
}

/*
 * Add an ILB rule.
 */
int
ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
{
        ilb_rule_t *rule;
        netstackid_t stackid;
        int ret;
        in_port_t min_port, max_port;
        int64_t num_src;

        /* Sanity checks. */
        if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
                return (EINVAL);

        /* Need to support SCTP... */
        if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
                return (EINVAL);

        /* For full NAT, the NAT source must be supplied. */
        if (cmd->topo == ILB_TOPO_IMPL_NAT) {
                if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
                    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
                        return (EINVAL);
                }
        }

        /* Check invalid mask */
        if ((cmd->flags & ILB_RULE_STICKY) &&
            IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
                return (EINVAL);
        }

        /* Port is passed in network byte order. */
        min_port = ntohs(cmd->min_port);
        max_port = ntohs(cmd->max_port);
        if (min_port > max_port)
                return (EINVAL);

        /* min_port == 0 means "all ports". Make it so */
        if (min_port == 0) {
                min_port = 1;
                max_port = 65535;
        }

        /* Funny address checking. */
        if (cmd->ip_ver == IPPROTO_IP) {
                in_addr_t v4_addr1, v4_addr2;

                v4_addr1 = cmd->vip.s6_addr32[3];
                if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
                    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
                    v4_addr1 == INADDR_ANY ||
                    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
                        return (EINVAL);
                }

                if (cmd->topo == ILB_TOPO_IMPL_NAT) {
                        v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
                        v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
                        if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
                            (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
                            v4_addr1 == INADDR_BROADCAST ||
                            v4_addr2 == INADDR_BROADCAST ||
                            v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
                            CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
                            !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
                            !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
                                return (EINVAL);
                        }

                        num_src = v4_addr2 - v4_addr1 + 1;
                        if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
                                return (EINVAL);
                }
        } else {
                if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
                    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
                    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
                    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
                        return (EINVAL);
                }

                if (cmd->topo == ILB_TOPO_IMPL_NAT) {
                        if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
                            IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
                            IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
                            IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
                            IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
                            IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
                            IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
                            IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
                                return (EINVAL);
                        }

                        if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
                            &cmd->nat_src_end)) < 0 ||
                            num_src > ILB_MAX_NAT_SRC) {
                                return (EINVAL);
                        }
                }
        }

        mutex_enter(&ilbs->ilbs_g_lock);
        if (ilbs->ilbs_g_hash == NULL)
                ilb_rule_hash_init(ilbs);
        if (ilbs->ilbs_c2s_conn_hash == NULL) {
                ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
                ilb_conn_hash_init(ilbs);
                ilb_nat_src_init(ilbs);
        }

        /* Make sure that the new rule does not duplicate an existing one. */
        if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
            min_port, max_port, &cmd->vip)) {
                mutex_exit(&ilbs->ilbs_g_lock);
                return (EEXIST);
        }

        rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
        if (rule == NULL) {
                mutex_exit(&ilbs->ilbs_g_lock);
                return (ENOMEM);
        }

        /* ir_name is all 0 to begin with */
        (void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);

        rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance);
        stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
        if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
                ret = ENOMEM;
                goto error;
        }

        if (cmd->topo == ILB_TOPO_IMPL_NAT) {
                rule->ir_nat_src_start = cmd->nat_src_start;
                rule->ir_nat_src_end = cmd->nat_src_end;
        }

        rule->ir_ipver = cmd->ip_ver;
        rule->ir_proto = cmd->proto;
        rule->ir_topo = cmd->topo;

        rule->ir_min_port = min_port;
        rule->ir_max_port = max_port;
        if (rule->ir_min_port != rule->ir_max_port)
                rule->ir_port_range = B_TRUE;
        else
                rule->ir_port_range = B_FALSE;

        rule->ir_zoneid = zoneid;

        rule->ir_target_v6 = cmd->vip;
        rule->ir_servers = NULL;

        /*
         * The default connection drain timeout is indefinite (value 0),
         * meaning we will wait for all connections to finish.  So we
         * can assign cmd->conn_drain_timeout to it directly.
         */
        rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;
        if (cmd->nat_expiry != 0) {
                rule->ir_nat_expiry = cmd->nat_expiry;
        } else {
                switch (rule->ir_proto) {
                case IPPROTO_TCP:
                        rule->ir_nat_expiry = ilb_conn_tcp_expiry;
                        break;
                case IPPROTO_UDP:
                        rule->ir_nat_expiry = ilb_conn_udp_expiry;
                        break;
                default:
                        cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
                            (void *)rule);
                        break;
                }
        }
        if (cmd->sticky_expiry != 0)
                rule->ir_sticky_expiry = cmd->sticky_expiry;
        else
                rule->ir_sticky_expiry = ilb_sticky_expiry;

        if (cmd->flags & ILB_RULE_STICKY) {
                rule->ir_flags |= ILB_RULE_STICKY;
                rule->ir_sticky_mask = cmd->sticky_mask;
                if (ilbs->ilbs_sticky_hash == NULL)
                        ilb_sticky_hash_init(ilbs);
        }
        if (cmd->flags & ILB_RULE_ENABLED)
                rule->ir_flags |= ILB_RULE_ENABLED;

        mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);

        rule->ir_refcnt = 1;

        switch (cmd->algo) {
        case ILB_ALG_IMPL_ROUNDROBIN:
                if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
                        ret = ENOMEM;
                        goto error;
                }
                rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
                break;
        case ILB_ALG_IMPL_HASH_IP:
        case ILB_ALG_IMPL_HASH_IP_SPORT:
        case ILB_ALG_IMPL_HASH_IP_VIP:
                if ((rule->ir_alg = ilb_alg_hash_init(rule,
                    &cmd->algo)) == NULL) {
                        ret = ENOMEM;
                        goto error;
                }
                rule->ir_alg_type = cmd->algo;
                break;
        default:
                ret = EINVAL;
                goto error;
        }

        /* Add it to the global list and hash array at the end. */
        ilb_rule_g_add(ilbs, rule);
        ilb_rule_hash_add(ilbs, rule, &cmd->vip);

        mutex_exit(&ilbs->ilbs_g_lock);

        return (0);

error:
        mutex_exit(&ilbs->ilbs_g_lock);
        if (rule->ir_ksp != NULL) {
                /* stackid must be initialized if ir_ksp != NULL */
                kstat_delete_netstack(rule->ir_ksp, stackid);
        }
        kmem_free(rule, sizeof (ilb_rule_t));
        return (ret);
}

/*
 * The final part in deleting a rule.  Either called directly or by the
 * taskq dispatched.
 */
static void
ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
{
        netstackid_t stackid;
        ilb_server_t *server;

        stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;

        /*
         * Let the algorithm know that the rule is going away.  The
         * algorithm fini routine will free all its resources with this
         * rule.
         */
        tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);

        while ((server = tmp_rule->ir_servers) != NULL) {
                mutex_enter(&server->iser_lock);
                ilb_destroy_nat_src(&server->iser_nat_src);
                if (tmp_rule->ir_conn_drain_timeout != 0) {
                        /*
                         * The garbage collection thread checks this value
                         * without grabing a lock.  So we need to use
                         * atomic_swap_64() to make sure that the value seen
                         * by gc thread is intact.
                         */
                        (void) atomic_swap_64(
                            (uint64_t *)&server->iser_die_time,
                            ddi_get_lbolt64() +
                            SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
                }
                while (server->iser_refcnt > 1)
                        cv_wait(&server->iser_cv, &server->iser_lock);
                tmp_rule->ir_servers = server->iser_next;
                kstat_delete_netstack(server->iser_ksp, stackid);
                kmem_free(server, sizeof (ilb_server_t));
        }

        ASSERT(tmp_rule->ir_ksp != NULL);
        kstat_delete_netstack(tmp_rule->ir_ksp, stackid);

        kmem_free(tmp_rule, sizeof (ilb_rule_t));
}

/* The routine executed by the delayed rule taskq. */
static void
ilb_rule_del_tq(void *arg)
{
        ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
        ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;

        mutex_enter(&rule->ir_lock);
        while (rule->ir_refcnt > 1)
                cv_wait(&rule->ir_cv, &rule->ir_lock);
        ilb_rule_del_common(ilbs, rule);
        kmem_free(arg, sizeof (ilb_rule_tq_t));
}

/* Routine to delete a rule. */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
        ilb_rule_t *tmp_rule;
        ilb_rule_tq_t *arg;
        int err;

        mutex_enter(&ilbs->ilbs_g_lock);
        if ((tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name,
            &err)) == NULL) {
                mutex_exit(&ilbs->ilbs_g_lock);
                return (err);
        }

        /*
         * First remove the rule from the hash array and the global list so
         * that no one can find this rule any more.
         */
        ilb_rule_hash_del(tmp_rule);
        ilb_rule_g_del(ilbs, tmp_rule);
        mutex_exit(&ilbs->ilbs_g_lock);
        ILB_RULE_REFRELE(tmp_rule);

        /*
         * Now no one can find this rule, we can remove it once all
         * references to it are dropped and all references to the list
         * of servers are dropped.  So dispatch a task to finish the deletion.
         * We do this instead of letting the last one referencing the
         * rule do it.  The reason is that the last one may be the
         * interrupt thread.  We want to minimize the work it needs to
         * do.  Rule deletion is not a critical task so it can be delayed.
         */
        arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
        arg->ilbs = ilbs;
        arg->rule = tmp_rule;
        (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
            TQ_SLEEP);

        return (0);
}

/*
 * Given an IP address, check to see if there is a rule using this
 * as the VIP.  It can be used to check if we need to drop a fragment.
 */
boolean_t
ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
{
        int i;
        ilb_rule_t *rule;
        boolean_t ret = B_FALSE;

        i = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
            ilbs->ilbs_rule_hash_size);
        mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
            rule = rule->ir_hash_next) {
                if (IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6)) {
                        mutex_enter(&rule->ir_lock);
                        if (rule->ir_flags & ILB_RULE_BUSY) {
                                mutex_exit(&rule->ir_lock);
                                break;
                        }
                        if (ret_rule != NULL) {
                                rule->ir_refcnt++;
                                mutex_exit(&rule->ir_lock);
                                *ret_rule = rule;
                        } else {
                                mutex_exit(&rule->ir_lock);
                        }
                        ret = B_TRUE;
                        break;
                }
        }
        mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        return (ret);
}

boolean_t
ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
{
        int i;
        ilb_rule_t *rule;
        boolean_t ret = B_FALSE;

        i = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
        mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
            rule = rule->ir_hash_next) {
                if (rule->ir_target_v6.s6_addr32[3] == addr) {
                        mutex_enter(&rule->ir_lock);
                        if (rule->ir_flags & ILB_RULE_BUSY) {
                                mutex_exit(&rule->ir_lock);
                                break;
                        }
                        if (ret_rule != NULL) {
                                rule->ir_refcnt++;
                                mutex_exit(&rule->ir_lock);
                                *ret_rule = rule;
                        } else {
                                mutex_exit(&rule->ir_lock);
                        }
                        ret = B_TRUE;
                        break;
                }
        }
        mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
        return (ret);
}

static ilb_rule_t *
ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
        ilb_rule_t *tmp_rule;

        ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

        for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
            tmp_rule = tmp_rule->ir_next) {
                if (tmp_rule->ir_zoneid != zoneid)
                        continue;
                if (strcasecmp(tmp_rule->ir_name, name) == 0) {
                        mutex_enter(&tmp_rule->ir_lock);
                        if (tmp_rule->ir_flags & ILB_RULE_BUSY) {
                                mutex_exit(&tmp_rule->ir_lock);
                                *err = EINPROGRESS;
                                return (NULL);
                        }
                        tmp_rule->ir_refcnt++;
                        mutex_exit(&tmp_rule->ir_lock);
                        *err = 0;
                        return (tmp_rule);
                }
        }
        *err = ENOENT;
        return (NULL);
}

/* To find a rule with a given name and zone in the global rule list. */
ilb_rule_t *
ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
        ilb_rule_t *tmp_rule;

        mutex_enter(&ilbs->ilbs_g_lock);
        tmp_rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
        mutex_exit(&ilbs->ilbs_g_lock);
        return (tmp_rule);
}

/* Try to match the given packet info and zone ID with a rule. */
static boolean_t
ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
    int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
{
        ilb_rule_t *tmp_rule;

        ASSERT(mutex_owned(&ilbs->ilbs_g_lock));

        for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
            tmp_rule = tmp_rule->ir_next) {
                if (tmp_rule->ir_zoneid != zoneid)
                        continue;

                /*
                 * We don't allow the same name in different rules even if all
                 * the other rule components are different.
                 */
                if (strcasecmp(tmp_rule->ir_name, name) == 0)
                        return (B_TRUE);

                if (tmp_rule->ir_ipver != l3 || tmp_rule->ir_proto != l4)
                        continue;

                /*
                 * ir_min_port and ir_max_port are the same if ir_port_range
                 * is false.  In this case, if the ir_min|max_port (same) is
                 * outside of the given port range, it is OK.  In other cases,
                 * check if min and max port are outside a rule's range.
                 */
                if (tmp_rule->ir_max_port < min_port ||
                    tmp_rule->ir_min_port > max_port) {
                        continue;
                }

                /*
                 * If l3 is IPv4, the addr passed in is assumed to be
                 * mapped address.
                 */
                if (V6_OR_V4_INADDR_ANY(*addr) ||
                    V6_OR_V4_INADDR_ANY(tmp_rule->ir_target_v6) ||
                    IN6_ARE_ADDR_EQUAL(addr, &tmp_rule->ir_target_v6)) {
                        return (B_TRUE);
                }
        }
        return (B_FALSE);
}

int
ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
        ilb_rule_t *rule;
        int err;

        ASSERT((in_rule == NULL && rule_name != NULL) ||
            (in_rule != NULL && rule_name == NULL));
        if ((rule = in_rule) == NULL) {
                if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
                    &err)) == NULL) {
                        return (err);
                }
        }
        mutex_enter(&rule->ir_lock);
        rule->ir_flags |= ILB_RULE_ENABLED;
        mutex_exit(&rule->ir_lock);

        /* Only refrele if the rule is passed in. */
        if (in_rule == NULL)
                ILB_RULE_REFRELE(rule);
        return (0);
}

int
ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
        ilb_rule_t *rule;
        int err;

        ASSERT((in_rule == NULL && rule_name != NULL) ||
            (in_rule != NULL && rule_name == NULL));
        if ((rule = in_rule) == NULL) {
                if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
                    &err)) == NULL) {
                        return (err);
                }
        }
        mutex_enter(&rule->ir_lock);
        rule->ir_flags &= ~ILB_RULE_ENABLED;
        mutex_exit(&rule->ir_lock);

        /* Only refrele if the rule is passed in. */
        if (in_rule == NULL)
                ILB_RULE_REFRELE(rule);
        return (0);
}

/*
 * XXX We should probably have a walker function to walk all rules.  For
 * now, just add a simple loop for enable/disable/del.
 */
void
ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
        ilb_rule_t *rule;

        mutex_enter(&ilbs->ilbs_g_lock);
        for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
                if (rule->ir_zoneid != zoneid)
                        continue;
                /*
                 * No need to hold the rule as we are holding the global
                 * lock so it won't go away.  Ignore the return value here
                 * as the rule is provided so the call cannot fail.
                 */
                (void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
        }
        mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
        ilb_rule_t *rule;

        mutex_enter(&ilbs->ilbs_g_lock);
        for (rule = ilbs->ilbs_rule_head; rule != NULL;
            rule = rule->ir_next) {
                if (rule->ir_zoneid != zoneid)
                        continue;
                (void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
        }
        mutex_exit(&ilbs->ilbs_g_lock);
}

void
ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
        ilb_rule_t *rule;
        ilb_rule_tq_t *arg;

        mutex_enter(&ilbs->ilbs_g_lock);
        while ((rule = ilbs->ilbs_rule_head) != NULL) {
                if (rule->ir_zoneid != zoneid)
                        continue;
                ilb_rule_hash_del(rule);
                ilb_rule_g_del(ilbs, rule);
                mutex_exit(&ilbs->ilbs_g_lock);

                arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
                arg->ilbs = ilbs;
                arg->rule = rule;
                (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
                    arg, TQ_SLEEP);

                mutex_enter(&ilbs->ilbs_g_lock);
        }
        mutex_exit(&ilbs->ilbs_g_lock);
}

/*
 * This is just an optimization, so don't grab the global lock.  The
 * worst case is that we missed a couple packets.
 */
boolean_t
ilb_has_rules(ilb_stack_t *ilbs)
{
        return (ilbs->ilbs_rule_head != NULL);
}


static int
ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
{
        ilb_server_t *tmp_server;
        int ret;

        ASSERT((rule == NULL && rule_name != NULL) ||
            (rule != NULL && rule_name == NULL));

        if (rule == NULL) {
                if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
                    &ret)) == NULL) {
                        return (ret);
                }
        }

        /* Once we get a hold on the rule, no server can be added/deleted. */
        for (tmp_server = rule->ir_servers; tmp_server != NULL;
            tmp_server = tmp_server->iser_next) {
                if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
                        break;
        }
        if (tmp_server == NULL) {
                ret = ENOENT;
                goto done;
        }

        if (enable) {
                ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
                    rule->ir_alg->ilb_alg_data);
                if (ret == 0) {
                        tmp_server->iser_enabled = B_TRUE;
                        tmp_server->iser_die_time = 0;
                }
        } else {
                ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
                    rule->ir_alg->ilb_alg_data);
                if (ret == 0) {
                        tmp_server->iser_enabled = B_FALSE;
                        if (rule->ir_conn_drain_timeout != 0) {
                                (void) atomic_swap_64(
                                    (uint64_t *)&tmp_server->iser_die_time,
                                    ddi_get_lbolt64() + SEC_TO_TICK(
                                    rule->ir_conn_drain_timeout));
                        }
                }
        }

done:
        if (rule_name != NULL)
                ILB_RULE_REFRELE(rule);
        return (ret);
}
int
ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
        return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
}

int
ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
        return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
}

/*
 * Add a back end server to a rule.  If the address is IPv4, it is assumed
 * to be passed in as a mapped address.
 */
int
ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
{
        ilb_server_t    *server;
        netstackid_t    stackid;
        int             ret = 0;
        in_port_t       min_port, max_port;
        in_port_t       range;

        /* Port is passed in network byte order. */
        min_port = ntohs(info->min_port);
        max_port = ntohs(info->max_port);
        if (min_port > max_port)
                return (EINVAL);

        /* min_port == 0 means "all ports". Make it so */
        if (min_port == 0) {
                min_port = 1;
                max_port = 65535;
        }
        range = max_port - min_port;

        mutex_enter(&rule->ir_lock);
        /* If someone is already doing server add/del, sleeps and wait. */
        while (rule->ir_flags & ILB_RULE_BUSY) {
                if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
                        mutex_exit(&rule->ir_lock);
                        return (EINTR);
                }
        }

        /*
         * Set the rule to be busy to make sure that no new packet can
         * use this rule.
         */
        rule->ir_flags |= ILB_RULE_BUSY;

        /* Now wait for all other guys to finish their work. */
        while (rule->ir_refcnt > 2) {
                if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
                        mutex_exit(&rule->ir_lock);
                        ret = EINTR;
                        goto end;
                }
        }
        mutex_exit(&rule->ir_lock);

        /* Sanity checks... */
        if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
            rule->ir_ipver != IPPROTO_IP) ||
            (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
            rule->ir_ipver != IPPROTO_IPV6)) {
                ret = EINVAL;
                goto end;
        }

        /*
         * Check for valid port range.
         *
         * For DSR, there can be no port shifting.  Hence the server
         * specification must be the same as the rule's.
         *
         * For half-NAT/NAT, the range must either be 0 (port collapsing) or
         * it must be equal to the same value as the rule port range.
         *
         */
        if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
                if (rule->ir_max_port != max_port ||
                    rule->ir_min_port != min_port) {
                        ret = EINVAL;
                        goto end;
                }
        } else {
                if ((range != rule->ir_max_port - rule->ir_min_port) &&
                    range != 0) {
                        ret = EINVAL;
                        goto end;
                }
        }

        /* Check for duplicate. */
        for (server = rule->ir_servers; server != NULL;
            server = server->iser_next) {
                if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
                    strcasecmp(server->iser_name, info->name) == 0) {
                        break;
                }
        }
        if (server != NULL) {
                ret = EEXIST;
                goto end;
        }

        if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
                ret = ENOMEM;
                goto end;
        }

        (void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
        (void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
            sizeof (server->iser_ip_addr));
        stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
        server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
        if (server->iser_ksp == NULL) {
                kmem_free(server, sizeof (ilb_server_t));
                ret = EINVAL;
                goto end;
        }

        server->iser_stackid = stackid;
        server->iser_addr_v6 = info->addr;
        server->iser_min_port = min_port;
        server->iser_max_port = max_port;
        if (min_port != max_port)
                server->iser_port_range = B_TRUE;
        else
                server->iser_port_range = B_FALSE;

        /*
         * If the rule uses NAT, find/create the NAT source entry to use
         * for this server.
         */
        if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
                in_port_t port;

                /*
                 * If the server uses a port range, our port allocation
                 * scheme needs to treat it as a wildcard.  Refer to the
                 * comments in ilb_nat.c about the scheme.
                 */
                if (server->iser_port_range)
                        port = 0;
                else
                        port = server->iser_min_port;

                if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
                    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
                    num_nat_src_v6(&rule->ir_nat_src_start,
                    &rule->ir_nat_src_end))) != 0) {
                        kstat_delete_netstack(server->iser_ksp, stackid);
                        kmem_free(server, sizeof (ilb_server_t));
                        goto end;
                }
        }

        /*
         * The iser_lock is only used to protect iser_refcnt.  All the other
         * fields in ilb_server_t should not change, except for iser_enabled.
         * The worst thing that can happen if iser_enabled is messed up is
         * that one or two packets may not be load balanced to a server
         * correctly.
         */
        server->iser_refcnt = 1;
        server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
            B_FALSE;
        mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);

        /* Let the load balancing algorithm know about the addition. */
        ASSERT(rule->ir_alg != NULL);
        if ((ret = rule->ir_alg->ilb_alg_server_add(server,
            rule->ir_alg->ilb_alg_data)) != 0) {
                kstat_delete_netstack(server->iser_ksp, stackid);
                kmem_free(server, sizeof (ilb_server_t));
                goto end;
        }

        /*
         * No need to hold ir_lock since no other thread should manipulate
         * the following fields until ILB_RULE_BUSY is cleared.
         */
        if (rule->ir_servers == NULL) {
                server->iser_next = NULL;
        } else {
                server->iser_next = rule->ir_servers;
        }
        rule->ir_servers = server;
        ILB_R_KSTAT(rule, num_servers);

end:
        mutex_enter(&rule->ir_lock);
        rule->ir_flags &= ~ILB_RULE_BUSY;
        cv_signal(&rule->ir_cv);
        mutex_exit(&rule->ir_lock);
        return (ret);
}

/* The routine executed by the delayed rule processing taskq. */
static void
ilb_server_del_tq(void *arg)
{
        ilb_server_t *server = (ilb_server_t *)arg;

        mutex_enter(&server->iser_lock);
        while (server->iser_refcnt > 1)
                cv_wait(&server->iser_cv, &server->iser_lock);
        kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
        kmem_free(server, sizeof (ilb_server_t));
}

/*
 * Delete a back end server from a rule.  If the address is IPv4, it is assumed
 * to be passed in as a mapped address.
 */
int
ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
        ilb_server_t    *server;
        ilb_server_t    *prev_server;
        int             ret = 0;

        ASSERT((rule == NULL && rule_name != NULL) ||
            (rule != NULL && rule_name == NULL));
        if (rule == NULL) {
                if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
                    &ret)) == NULL) {
                        return (ret);
                }
        }

        mutex_enter(&rule->ir_lock);
        /* If someone is already doing server add/del, sleeps and wait. */
        while (rule->ir_flags & ILB_RULE_BUSY) {
                if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
                        if (rule_name != NULL) {
                                if (--rule->ir_refcnt <= 2)
                                        cv_signal(&rule->ir_cv);
                        }
                        mutex_exit(&rule->ir_lock);
                        return (EINTR);
                }
        }
        /*
         * Set the rule to be busy to make sure that no new packet can
         * use this rule.
         */
        rule->ir_flags |= ILB_RULE_BUSY;

        /* Now wait for all other guys to finish their work. */
        while (rule->ir_refcnt > 2) {
                if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
                        mutex_exit(&rule->ir_lock);
                        ret = EINTR;
                        goto end;
                }
        }
        mutex_exit(&rule->ir_lock);

        prev_server = NULL;
        for (server = rule->ir_servers; server != NULL;
            prev_server = server, server = server->iser_next) {
                if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
                        break;
        }
        if (server == NULL) {
                ret = ENOENT;
                goto end;
        }

        /*
         * Let the load balancing algorithm know about the removal.
         * The algorithm may disallow the removal...
         */
        if ((ret = rule->ir_alg->ilb_alg_server_del(server,
            rule->ir_alg->ilb_alg_data)) != 0) {
                goto end;
        }

        if (prev_server == NULL)
                rule->ir_servers = server->iser_next;
        else
                prev_server->iser_next = server->iser_next;

        ILB_R_KSTAT_UPDATE(rule, num_servers, -1);

        /*
         * Mark the server as disabled so that if there is any sticky cache
         * using this server around, it won't be used.
         */
        server->iser_enabled = B_FALSE;

        mutex_enter(&server->iser_lock);

        /*
         * De-allocate the NAT source array.  The indiviual ilb_nat_src_entry_t
         * may not go away if there is still a conn using it.  The NAT source
         * timer will do the garbage collection.
         */
        ilb_destroy_nat_src(&server->iser_nat_src);

        /* If there is a hard limit on when a server should die, set it. */
        if (rule->ir_conn_drain_timeout != 0) {
                (void) atomic_swap_64((uint64_t *)&server->iser_die_time,
                    ddi_get_lbolt64() +
                    SEC_TO_TICK(rule->ir_conn_drain_timeout));
        }

        if (server->iser_refcnt > 1) {
                (void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
                    server, TQ_SLEEP);
                mutex_exit(&server->iser_lock);
        } else {
                kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
                kmem_free(server, sizeof (ilb_server_t));
        }

end:
        mutex_enter(&rule->ir_lock);
        rule->ir_flags &= ~ILB_RULE_BUSY;
        if (rule_name != NULL)
                rule->ir_refcnt--;
        cv_signal(&rule->ir_cv);
        mutex_exit(&rule->ir_lock);
        return (ret);
}

/*
 * First check if the destination of the ICMP message matches a VIP of
 * a rule.  If it does not, just return ILB_PASSED.
 *
 * If the destination matches a VIP:
 *
 * For ICMP_ECHO_REQUEST, generate a response on behalf of the back end
 * server.
 *
 * For ICMP_DEST_UNREACHABLE fragmentation needed, check inside the payload
 * and see which back end server we should send this message to.  And we
 * need to do NAT on both the payload message and the outside IP packet.
 *
 * For other ICMP messages, drop them.
 */
/* ARGSUSED */
static int
ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
    icmph_t *icmph, ipaddr_t *lb_dst)
{
        ipaddr_t vip;
        ilb_rule_t *rule;
        in6_addr_t addr6;

        if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
                return (ILB_PASSED);


        if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
                ILB_R_KSTAT(rule, icmp_dropped);
                ILB_RULE_REFRELE(rule);
                return (ILB_DROPPED);
        }

        switch (icmph->icmph_type) {
        case ICMP_ECHO_REQUEST:
                ILB_R_KSTAT(rule, icmp_echo_processed);
                ILB_RULE_REFRELE(rule);

                icmph->icmph_type = ICMP_ECHO_REPLY;
                icmph->icmph_checksum = 0;
                icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
                ipha->ipha_ttl =
                    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
                *lb_dst = ipha->ipha_src;
                vip = ipha->ipha_dst;
                ipha->ipha_dst = ipha->ipha_src;
                ipha->ipha_src = vip;
                return (ILB_BALANCED);
        case ICMP_DEST_UNREACHABLE: {
                int ret;

                if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
                        ILB_R_KSTAT(rule, icmp_dropped);
                        ILB_RULE_REFRELE(rule);
                        return (ILB_DROPPED);
                }
                if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
                    &addr6)) {
                        ILB_R_KSTAT(rule, icmp_2big_processed);
                        ret = ILB_BALANCED;
                } else {
                        ILB_R_KSTAT(rule, icmp_2big_dropped);
                        ret = ILB_DROPPED;
                }
                ILB_RULE_REFRELE(rule);
                IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
                return (ret);
        }
        default:
                ILB_R_KSTAT(rule, icmp_dropped);
                ILB_RULE_REFRELE(rule);
                return (ILB_DROPPED);
        }
}

/* ARGSUSED */
static int
ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
    icmp6_t *icmp6, in6_addr_t *lb_dst)
{
        ilb_rule_t *rule;

        if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
                return (ILB_PASSED);

        if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
                ILB_R_KSTAT(rule, icmp_dropped);
                ILB_RULE_REFRELE(rule);
                return (ILB_DROPPED);
        }

        switch (icmp6->icmp6_type) {
        case ICMP6_ECHO_REQUEST: {
                int hdr_len;

                ILB_R_KSTAT(rule, icmp_echo_processed);
                ILB_RULE_REFRELE(rule);

                icmp6->icmp6_type = ICMP6_ECHO_REPLY;
                icmp6->icmp6_cksum = ip6h->ip6_plen;
                hdr_len = (char *)icmp6 - (char *)ip6h;
                icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
                    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
                ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
                ip6h->ip6_hops =
                    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
                *lb_dst = ip6h->ip6_src;
                ip6h->ip6_src = ip6h->ip6_dst;
                ip6h->ip6_dst = *lb_dst;
                return (ILB_BALANCED);
        }
        case ICMP6_PACKET_TOO_BIG: {
                int ret;

                if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
                    lb_dst)) {
                        ILB_R_KSTAT(rule, icmp_2big_processed);
                        ret = ILB_BALANCED;
                } else {
                        ILB_R_KSTAT(rule, icmp_2big_dropped);
                        ret = ILB_DROPPED;
                }
                ILB_RULE_REFRELE(rule);
                return (ret);
        }
        default:
                ILB_R_KSTAT(rule, icmp_dropped);
                ILB_RULE_REFRELE(rule);
                return (ILB_DROPPED);
        }
}

/*
 * Common routine to check an incoming packet and decide what to do with it.
 * called by ilb_check_v4|v6().
 */
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
    in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
    in6_addr_t *lb_dst)
{
        in_port_t               sport, dport;
        tcpha_t                 *tcph;
        udpha_t                 *udph;
        ilb_rule_t              *rule;
        ilb_server_t            *server;
        boolean_t               balanced;
        struct ilb_sticky_s     *s = NULL;
        int                     ret;
        uint32_t                ip_sum, tp_sum;
        ilb_nat_info_t          info;
        uint16_t                nat_src_idx;
        boolean_t               busy;

        ret = 0;

        /*
         * We don't really need to switch here since both protocols's
         * ports are at the same offset.  Just prepare for future protocol
         * specific processing.
         */
        switch (l4) {
        case IPPROTO_TCP:
                if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
                        return (ILB_DROPPED);
                tcph = (tcpha_t *)tph;
                sport = tcph->tha_lport;
                dport = tcph->tha_fport;
                break;
        case IPPROTO_UDP:
                if (tph + sizeof (udpha_t) > mp->b_wptr)
                        return (ILB_DROPPED);
                udph = (udpha_t *)tph;
                sport = udph->uha_src_port;
                dport = udph->uha_dst_port;
                break;
        default:
                return (ILB_PASSED);
        }

        /* Fast path, there is an existing conn. */
        if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
            pkt_len, lb_dst)) {
                return (ILB_BALANCED);
        }

        /*
         * If there is no existing connection for the incoming packet, check
         * to see if the packet matches a rule.  If not, just let IP decide
         * what to do with it.
         *
         * Note: a reply from back end server should not match a rule.  A
         * reply should match one existing conn.
         */
        rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
            pkt_len, &busy);
        if (rule == NULL) {
                /* If the rule is busy, just drop the packet. */
                if (busy)
                        return (ILB_DROPPED);
                else
                        return (ILB_PASSED);
        }

        /*
         * The packet matches a rule, use the rule load balance algorithm
         * to find a server.
         */
        balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
            rule->ir_alg->ilb_alg_data, &server);
        /*
         * This can only happen if there is no server in a rule or all
         * the servers are currently disabled.
         */
        if (!balanced)
                goto no_server;

        /*
         * If the rule is sticky enabled, we need to check the sticky table.
         * If there is a sticky entry for the client, use the previous server
         * instead of the one found above (note that both can be the same).
         * If there is no entry for that client, add an entry to the sticky
         * table.  Both the find and add are done in ilb_sticky_find_add()
         * to avoid checking for duplicate when adding an entry.
         */
        if (rule->ir_flags & ILB_RULE_STICKY) {
                in6_addr_t addr;

                V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
                if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
                    &s, &nat_src_idx)) == NULL) {
                        ILB_R_KSTAT(rule, nomem_pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
                        goto no_server;
                }
        }

        /*
         * We are holding a reference on the rule, so the server
         * cannot go away.
         */
        *lb_dst = server->iser_addr_v6;
        ILB_S_KSTAT(server, pkt_processed);
        ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);

        switch (rule->ir_topo) {
        case ILB_TOPO_IMPL_NAT: {
                ilb_nat_src_entry_t     *src_ent;
                uint16_t                *src_idx;

                /*
                 * We create a cache even if it is not a SYN segment.
                 * The server should return a RST.  When we see the
                 * RST, we will destroy this cache.  But by having
                 * a cache, we know how to NAT the returned RST.
                 */
                info.vip = *dst;
                info.dport = dport;
                info.src = *src;
                info.sport = sport;

                /* If stickiness is enabled, use the same source address */
                if (s != NULL)
                        src_idx = &nat_src_idx;
                else
                        src_idx = NULL;

                if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
                    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
                        if (s != NULL)
                                ilb_sticky_refrele(s);
                        ILB_R_KSTAT(rule, pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
                        ILB_R_KSTAT(rule, noport_pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
                        ret = ILB_DROPPED;
                        break;
                }
                info.src_ent = src_ent;
                info.nat_dst = server->iser_addr_v6;
                if (rule->ir_port_range && server->iser_port_range) {
                        info.nat_dport = htons(ntohs(dport) -
                            rule->ir_min_port + server->iser_min_port);
                } else {
                        info.nat_dport = htons(server->iser_min_port);
                }

                /*
                 * If ilb_conn_add() fails, it will release the reference on
                 * sticky info and de-allocate the NAT source port allocated
                 * above.
                 */
                if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
                    dport, &info, &ip_sum, &tp_sum, s) != 0) {
                        ILB_R_KSTAT(rule, pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
                        ILB_R_KSTAT(rule, nomem_pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
                        ret = ILB_DROPPED;
                        break;
                }
                ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
                ret = ILB_BALANCED;
                break;
        }
        case ILB_TOPO_IMPL_HALF_NAT:
                info.vip = *dst;
                info.nat_dst = server->iser_addr_v6;
                info.dport = dport;
                if (rule->ir_port_range && server->iser_port_range) {
                        info.nat_dport = htons(ntohs(dport) -
                            rule->ir_min_port + server->iser_min_port);
                } else {
                        info.nat_dport = htons(server->iser_min_port);
                }

                if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
                    dport, &info, &ip_sum, &tp_sum, s) != 0) {
                        ILB_R_KSTAT(rule, pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
                        ILB_R_KSTAT(rule, nomem_pkt_dropped);
                        ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
                        ret = ILB_DROPPED;
                        break;
                }
                ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);

                ret = ILB_BALANCED;
                break;
        case ILB_TOPO_IMPL_DSR:
                /*
                 * By decrementing the sticky refcnt, the period of
                 * stickiness (life time of ilb_sticky_t) will be
                 * from now to (now + default expiry time).
                 */
                if (s != NULL)
                        ilb_sticky_refrele(s);
                ret = ILB_BALANCED;
                break;
        default:
                cmn_err(CE_PANIC, "data corruption unknown topology: %p",
                    (void *) rule);
                break;
        }
        ILB_RULE_REFRELE(rule);
        return (ret);

no_server:
        /* This can only happen if there is no server available. */
        ILB_R_KSTAT(rule, pkt_dropped);
        ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
        ILB_RULE_REFRELE(rule);
        return (ILB_DROPPED);
}

int
ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
    uint8_t *tph, ipaddr_t *lb_dst)
{
        in6_addr_t v6_src, v6_dst, v6_lb_dst;
        int ret;

        ASSERT(DB_REF(mp) == 1);

        if (l4 == IPPROTO_ICMP) {
                return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
                    lb_dst));
        }

        IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6_src);
        IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6_dst);
        ret = ilb_check(ilbs, ill, mp, &v6_src, &v6_dst, IPPROTO_IP, l4, ipha,
            tph, ntohs(ipha->ipha_length), &v6_lb_dst);
        if (ret == ILB_BALANCED)
                IN6_V4MAPPED_TO_IPADDR(&v6_lb_dst, *lb_dst);
        return (ret);
}

int
ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
    uint8_t *tph, in6_addr_t *lb_dst)
{
        uint32_t pkt_len;

        ASSERT(DB_REF(mp) == 1);

        if (l4 == IPPROTO_ICMPV6) {
                return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
                    lb_dst));
        }

        pkt_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
        return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
            IPPROTO_IPV6, l4, ip6h, tph, pkt_len, lb_dst));
}

void
ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
{
        ilb_rule_t *tmp_rule;

        mutex_enter(&ilbs->ilbs_g_lock);
        *num_rules = 0;
        for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
            tmp_rule = tmp_rule->ir_next) {
                if (tmp_rule->ir_zoneid == zoneid)
                        *num_rules += 1;
        }
        mutex_exit(&ilbs->ilbs_g_lock);
}

int
ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    uint32_t *num_servers)
{
        ilb_rule_t *rule;
        int err;

        if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
                return (err);
        *num_servers = rule->ir_kstat.num_servers.value.ui64;
        ILB_RULE_REFRELE(rule);
        return (0);
}

int
ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_server_info_t *servers, uint32_t *num_servers)
{
        ilb_rule_t *rule;
        ilb_server_t *server;
        size_t cnt;
        int err;

        if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
                return (err);
        for (server = rule->ir_servers, cnt = *num_servers;
            server != NULL && cnt > 0;
            server = server->iser_next, cnt--, servers++) {
                (void) memcpy(servers->name, server->iser_name,
                    ILB_SERVER_NAMESZ);
                servers->addr = server->iser_addr_v6;
                servers->min_port = htons(server->iser_min_port);
                servers->max_port = htons(server->iser_max_port);
                servers->flags = server->iser_enabled ? ILB_SERVER_ENABLED : 0;
                servers->err = 0;
        }
        ILB_RULE_REFRELE(rule);
        *num_servers -= cnt;

        return (0);
}

void
ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
    char *buf)
{
        ilb_rule_t *tmp_rule;
        int cnt;

        if (*num_names == 0)
                return;

        mutex_enter(&ilbs->ilbs_g_lock);
        for (cnt = 0, tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
            tmp_rule = tmp_rule->ir_next) {
                if (tmp_rule->ir_zoneid != zoneid)
                        continue;

                (void) memcpy(buf, tmp_rule->ir_name, ILB_RULE_NAMESZ);
                buf += ILB_RULE_NAMESZ;
                if (++cnt == *num_names)
                        break;
        }
        mutex_exit(&ilbs->ilbs_g_lock);
        *num_names = cnt;
}

int
ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
{
        ilb_rule_t *rule;
        int err;

        if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL) {
                return (err);
        }

        /*
         * Except the enabled flags, none of the following will change
         * in the life time of a rule.  So we don't hold the mutex when
         * reading them.  The worst is to report a wrong enabled flags.
         */
        cmd->ip_ver = rule->ir_ipver;
        cmd->proto = rule->ir_proto;
        cmd->min_port = htons(rule->ir_min_port);
        cmd->max_port = htons(rule->ir_max_port);

        cmd->vip = rule->ir_target_v6;
        cmd->algo = rule->ir_alg_type;
        cmd->topo = rule->ir_topo;

        cmd->nat_src_start = rule->ir_nat_src_start;
        cmd->nat_src_end = rule->ir_nat_src_end;

        cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
        cmd->nat_expiry = rule->ir_nat_expiry;
        cmd->sticky_expiry = rule->ir_sticky_expiry;

        cmd->flags = 0;
        if (rule->ir_flags & ILB_RULE_ENABLED)
                cmd->flags |= ILB_RULE_ENABLED;
        if (rule->ir_flags & ILB_RULE_STICKY) {
                cmd->flags |= ILB_RULE_STICKY;
                cmd->sticky_mask = rule->ir_sticky_mask;
        }

        ILB_RULE_REFRELE(rule);
        return (0);
}

static void *
ilb_stack_init(netstackid_t stackid, netstack_t *ns)
{
        ilb_stack_t *ilbs;
        char tq_name[TASKQ_NAMELEN];

        ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
        ilbs->ilbs_netstack = ns;

        ilbs->ilbs_rule_head = NULL;
        ilbs->ilbs_g_hash = NULL;
        mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);

        ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
        if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
                kmem_free(ilbs, sizeof (ilb_stack_t));
                return (NULL);
        }

        /*
         * ilbs_conn/sticky_hash related info is initialized in
         * ilb_conn/sticky_hash_init().
         */
        ilbs->ilbs_conn_taskq = NULL;
        ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
        ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
        ilbs->ilbs_c2s_conn_hash = NULL;
        ilbs->ilbs_s2c_conn_hash = NULL;
        ilbs->ilbs_conn_timer_list = NULL;

        ilbs->ilbs_sticky_hash = NULL;
        ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
        ilbs->ilbs_sticky_timer_list = NULL;
        ilbs->ilbs_sticky_taskq = NULL;

        /* The allocation is done later when there is a rule using NAT mode. */
        ilbs->ilbs_nat_src = NULL;
        ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
        mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
        ilbs->ilbs_nat_src_tid = 0;

        /* For listing the conn hash table */
        mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
        ilbs->ilbs_conn_list_busy = B_FALSE;
        ilbs->ilbs_conn_list_cur = 0;
        ilbs->ilbs_conn_list_connp = NULL;

        /* For listing the sticky hash table */
        mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
        ilbs->ilbs_sticky_list_busy = B_FALSE;
        ilbs->ilbs_sticky_list_cur = 0;
        ilbs->ilbs_sticky_list_curp = NULL;

        (void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
            (void *)ns);
        ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
            minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

        return (ilbs);
}

/* ARGSUSED */
static void
ilb_stack_shutdown(netstackid_t stackid, void *arg)
{
        ilb_stack_t *ilbs = (ilb_stack_t *)arg;
        ilb_rule_t *tmp_rule;

        ilb_sticky_hash_fini(ilbs);
        ilb_conn_hash_fini(ilbs);
        mutex_enter(&ilbs->ilbs_g_lock);
        while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
                ilb_rule_hash_del(tmp_rule);
                ilb_rule_g_del(ilbs, tmp_rule);
                mutex_exit(&ilbs->ilbs_g_lock);
                ilb_rule_del_common(ilbs, tmp_rule);
                mutex_enter(&ilbs->ilbs_g_lock);
        }
        mutex_exit(&ilbs->ilbs_g_lock);
        if (ilbs->ilbs_nat_src != NULL)
                ilb_nat_src_fini(ilbs);
}

static void
ilb_stack_fini(netstackid_t stackid, void * arg)
{
        ilb_stack_t *ilbs = (ilb_stack_t *)arg;

        ilb_rule_hash_fini(ilbs);
        taskq_destroy(ilbs->ilbs_rule_taskq);
        ilb_kstat_g_fini(stackid, ilbs);
        kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
        kmem_free(ilbs, sizeof (ilb_stack_t));
}

void
ilb_ddi_g_init(void)
{
        netstack_register(NS_ILB, ilb_stack_init, ilb_stack_shutdown,
            ilb_stack_fini);
}

void
ilb_ddi_g_destroy(void)
{
        netstack_unregister(NS_ILB);
        ilb_conn_cache_fini();
        ilb_sticky_cache_fini();
}