#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/ksynch.h>
#include <sys/systm.h>
#include <sys/socket.h>
#include <sys/disp.h>
#include <sys/taskq.h>
#include <sys/cmn_err.h>
#include <sys/strsun.h>
#include <sys/sdt.h>
#include <sys/atomic.h>
#include <netinet/in.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <inet/udp_impl.h>
#include <inet/kstatcom.h>
#include <inet/ilb_ip.h>
#include "ilb_alg.h"
#include "ilb_nat.h"
#include "ilb_conn.h"
/* Flags passed to kmem allocation calls made by this module. */
int ilb_kmem_flags = 0;

/*
 * Default sizes of the rule, connection, sticky and NAT source hash
 * tables.  The rule hash size is rounded up to a power of 2 when the
 * table is created (see ilb_rule_hash_init()); the others are used by
 * the respective *_hash_init() routines defined elsewhere.
 */
static size_t ilb_rule_hash_size = 2048;
static size_t ilb_conn_hash_size = 262144;
static size_t ilb_sticky_hash_size = 262144;
static size_t ilb_nat_src_hash_size = 97;

/*
 * Default expiry values used when a rule does not specify its own
 * nat_expiry/sticky_expiry (see ilb_rule_add()).  Presumably in
 * seconds, matching the SEC_TO_TICK() conversions used for the related
 * conn-drain timeouts -- confirm against ilb_conn/ilb_sticky timers.
 */
static uint32_t ilb_conn_tcp_expiry = 120;
static uint32_t ilb_conn_udp_expiry = 60;
static uint32_t ilb_sticky_expiry = 60;
/*
 * Hash of the four bytes at addr (the low 32 bits of an address): a
 * polynomial hash with multiplier 31 (29791 = 31^3, 961 = 31^2), masked
 * by hash_size, which must therefore be a power of 2.
 */
#define ILB_RULE_HASH(addr, hash_size) \
	((*((addr) + 3) * 29791 + *((addr) + 2) * 961 + *((addr) + 1) * 31 + \
	*(addr)) & ((hash_size) - 1))

/*
 * Thread count for the rule taskq -- presumably passed to taskq_create()
 * in ilb_stack_init() (not visible in this chunk); confirm there.
 */
#define ILB_RULE_TASKQ_NUM_THR 20
/*
 * Argument block handed to the rule taskq (ilb_rule_del_tq()): the
 * stack instance and the rule to be reclaimed.  Allocated by the
 * dispatcher, freed by the taskq function.
 */
typedef struct {
	ilb_stack_t *ilbs;
	ilb_rule_t *rule;
} ilb_rule_tq_t;
/*
 * Forward declarations of the static functions defined in this file.
 * The rule-teardown helper is ilb_rule_del_common() -- the previous
 * declaration misspelled it as ilb_del_rule_common(), a function that
 * is never defined.
 */
static kstat_t *ilb_kstat_g_init(netstackid_t, ilb_stack_t *);
static void ilb_kstat_g_fini(netstackid_t, ilb_stack_t *);
static kstat_t *ilb_rule_kstat_init(netstackid_t, ilb_rule_t *);
static kstat_t *ilb_server_kstat_init(netstackid_t, ilb_rule_t *,
    ilb_server_t *);
static void ilb_rule_hash_init(ilb_stack_t *);
static void ilb_rule_hash_fini(ilb_stack_t *);
static void ilb_rule_hash_add(ilb_stack_t *, ilb_rule_t *, const in6_addr_t *);
static void ilb_rule_hash_del(ilb_rule_t *);
static ilb_rule_t *ilb_rule_hash(ilb_stack_t *, int, int, in6_addr_t *,
    in_port_t, zoneid_t, uint32_t, boolean_t *);
static void ilb_rule_g_add(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_g_del(ilb_stack_t *, ilb_rule_t *);
static void ilb_rule_del_common(ilb_stack_t *, ilb_rule_t *);
static ilb_rule_t *ilb_find_rule_locked(ilb_stack_t *, zoneid_t, const char *,
    int *);
static boolean_t ilb_match_rule(ilb_stack_t *, zoneid_t, const char *, int,
    int, in_port_t, in_port_t, const in6_addr_t *);
static void ilb_server_free(ilb_server_t *);
static void *ilb_stack_init(netstackid_t, netstack_t *);
static void ilb_stack_shutdown(netstackid_t, void *);
static void ilb_stack_fini(netstackid_t, void *);
static void ilb_rule_sticky_init(ilb_rule_t *);
static void ilb_rule_sticky_fini(ilb_rule_t *);
/*
 * Is the given (possibly v4-mapped) in6_addr_t unspecified?  A v4-mapped
 * address is checked against the v4-mapped INADDR_ANY form.
 */
#define IS_ADDR_UNSPEC(addr) \
	(IN6_IS_ADDR_V4MAPPED(addr) ? IN6_IS_ADDR_V4MAPPED_ANY(addr) : \
	IN6_IS_ADDR_UNSPECIFIED(addr))

/* Source of unique kstat instance numbers for rule/server kstats. */
static uint_t ilb_kstat_instance = 0;

/* kstat names/classes used by the global and per-rule kstats. */
#define ILB_G_KS_NAME "global"
#define ILB_G_KS_CNAME "kstat"
#define ILB_RULE_KS_CNAME "rulestat"
/*
 * Create and install the per-stack global ILB kstat ("global:kstat").
 * The named-kstat data lives in ilbs->ilbs_kstat (KSTAT_FLAG_VIRTUAL).
 * Returns the installed kstat, or NULL if it could not be created.
 */
static kstat_t *
ilb_kstat_g_init(netstackid_t stackid, ilb_stack_t *ilbs)
{
	kstat_t *ks;
	ilb_g_kstat_t template = {
		{ "num_rules", KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_in", KSTAT_DATA_UINT64, 0 },
		{ "ip_frag_dropped", KSTAT_DATA_UINT64, 0 }
	};

	ks = kstat_create_netstack(ILB_KSTAT_MOD_NAME, 0, ILB_G_KS_NAME,
	    ILB_G_KS_CNAME, KSTAT_TYPE_NAMED, NUM_OF_FIELDS(ilb_g_kstat_t),
	    KSTAT_FLAG_VIRTUAL, stackid);
	if (ks == NULL)
		return (NULL);

	/* Seed the stack-private data area and point the kstat at it. */
	bcopy(&template, ilbs->ilbs_kstat, sizeof (template));
	ks->ks_private = (void *)(uintptr_t)stackid;
	ks->ks_data = ilbs->ilbs_kstat;

	kstat_install(ks);
	return (ks);
}
/*
 * Tear down the per-stack global kstat created by ilb_kstat_g_init(),
 * if it exists, and clear ilbs_ksp.
 */
static void
ilb_kstat_g_fini(netstackid_t stackid, ilb_stack_t *ilbs)
{
	kstat_t *ks = ilbs->ilbs_ksp;

	if (ks == NULL)
		return;
	/* The kstat records the stack it belongs to in ks_private. */
	ASSERT(stackid == (netstackid_t)(uintptr_t)ks->ks_private);
	ilbs->ilbs_ksp = NULL;
	kstat_delete_netstack(ks, stackid);
}
/*
 * Create and install the per-rule kstat ("<rule>:rulestat").  The data
 * area is the rule's embedded ir_kstat.  Returns NULL on failure.
 */
static kstat_t *
ilb_rule_kstat_init(netstackid_t stackid, ilb_rule_t *rule)
{
	kstat_t *ks;
	ilb_rule_kstat_t template = {
		{ "num_servers", KSTAT_DATA_UINT64, 0 },
		{ "bytes_not_processed", KSTAT_DATA_UINT64, 0 },
		{ "pkt_not_processed", KSTAT_DATA_UINT64, 0 },
		{ "bytes_dropped", KSTAT_DATA_UINT64, 0 },
		{ "pkt_dropped", KSTAT_DATA_UINT64, 0 },
		{ "nomem_bytes_dropped", KSTAT_DATA_UINT64, 0 },
		{ "nomem_pkt_dropped", KSTAT_DATA_UINT64, 0 },
		{ "noport_bytes_dropped", KSTAT_DATA_UINT64, 0 },
		{ "noport_pkt_dropped", KSTAT_DATA_UINT64, 0 },
		{ "icmp_echo_processed", KSTAT_DATA_UINT64, 0 },
		{ "icmp_dropped", KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_processed", KSTAT_DATA_UINT64, 0 },
		{ "icmp_too_big_dropped", KSTAT_DATA_UINT64, 0 }
	};

	ks = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    rule->ir_name, ILB_RULE_KS_CNAME, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_rule_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ks == NULL)
		return (NULL);

	/* Seed the rule's embedded stats and point the kstat at them. */
	bcopy(&template, &rule->ir_kstat, sizeof (template));
	ks->ks_private = (void *)(uintptr_t)stackid;
	ks->ks_data = &rule->ir_kstat;

	kstat_install(ks);
	return (ks);
}
/*
 * Create and install the per-server kstat ("<server>:<rule>-sstat").
 * The data area is the server's embedded iser_kstat; the ip_address
 * field is a KSTAT_DATA_STRING pointing at the server's printable
 * address, so its length is added to ks_data_size.  Returns NULL on
 * failure.
 */
static kstat_t *
ilb_server_kstat_init(netstackid_t stackid, ilb_rule_t *rule,
    ilb_server_t *server)
{
	kstat_t *ksp;
	ilb_server_kstat_t template = {
		{ "bytes_processed", KSTAT_DATA_UINT64, 0 },
		{ "pkt_processed", KSTAT_DATA_UINT64, 0 },
		{ "ip_address", KSTAT_DATA_STRING, 0 }
	};
	char cname_buf[KSTAT_STRLEN];

	/* Rule name + "-sstat" + NUL must fit in the class name buffer. */
	ASSERT(strlen(rule->ir_name) + 7 < KSTAT_STRLEN);
	/*
	 * Use the bounded snprintf() instead of sprintf(): on a non-DEBUG
	 * kernel the ASSERT above compiles away, and an overlong rule name
	 * would otherwise overflow cname_buf.
	 */
	(void) snprintf(cname_buf, sizeof (cname_buf), "%s-sstat",
	    rule->ir_name);

	ksp = kstat_create_netstack(ILB_KSTAT_MOD_NAME, rule->ir_ks_instance,
	    server->iser_name, cname_buf, KSTAT_TYPE_NAMED,
	    NUM_OF_FIELDS(ilb_server_kstat_t), KSTAT_FLAG_VIRTUAL, stackid);
	if (ksp == NULL)
		return (NULL);

	bcopy(&template, &server->iser_kstat, sizeof (template));
	ksp->ks_data = &server->iser_kstat;
	ksp->ks_private = (void *)(uintptr_t)stackid;

	kstat_named_setstr(&server->iser_kstat.ip_address,
	    server->iser_ip_addr);
	/* The string data is external to the named kstats. */
	ksp->ks_data_size += strlen(server->iser_ip_addr) + 1;

	kstat_install(ksp);
	return (ksp);
}
/*
 * Allocate and initialize the stack's rule hash table.  The configured
 * size is rounded up to the next power of 2 so that ILB_RULE_HASH()'s
 * mask works.
 */
static void
ilb_rule_hash_init(ilb_stack_t *ilbs)
{
	int i;

	if (!ISP2(ilbs->ilbs_rule_hash_size)) {
		/* Find the smallest power of 2 larger than the size. */
		i = 0;
		while (i < 31 && ilbs->ilbs_rule_hash_size >= (1 << i))
			i++;
		ilbs->ilbs_rule_hash_size = 1 << i;
	}
	ilbs->ilbs_g_hash = kmem_zalloc(sizeof (ilb_hash_t) *
	    ilbs->ilbs_rule_hash_size, KM_SLEEP);
	for (i = 0; i < ilbs->ilbs_rule_hash_size; i++) {
		mutex_init(&ilbs->ilbs_g_hash[i].ilb_hash_lock, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}
/*
 * Free the stack's rule hash table, if it was ever created.
 */
static void
ilb_rule_hash_fini(ilb_stack_t *ilbs)
{
	if (ilbs->ilbs_g_hash != NULL) {
		kmem_free(ilbs->ilbs_g_hash, sizeof (ilb_hash_t) *
		    ilbs->ilbs_rule_hash_size);
	}
}
/*
 * Insert a rule at the head of its hash bucket, keyed on the low 32
 * bits of the VIP (the embedded IPv4 address for v4-mapped VIPs).
 */
static void
ilb_rule_hash_add(ilb_stack_t *ilbs, ilb_rule_t *rule, const in6_addr_t *addr)
{
	ilb_hash_t *bucket;
	int idx;

	idx = ILB_RULE_HASH((uint8_t *)&addr->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	DTRACE_PROBE2(ilb__rule__hash__add, ilb_rule_t *, rule, int, idx);
	bucket = &ilbs->ilbs_g_hash[idx];

	/* Head insertion into the bucket's doubly linked list. */
	mutex_enter(&bucket->ilb_hash_lock);
	rule->ir_hash_next = bucket->ilb_hash_rule;
	if (bucket->ilb_hash_rule != NULL)
		bucket->ilb_hash_rule->ir_hash_prev = rule;
	rule->ir_hash_prev = NULL;
	bucket->ilb_hash_rule = rule;

	/* Remember the bucket so ilb_rule_hash_del() can find its lock. */
	rule->ir_hash = bucket;
	mutex_exit(&bucket->ilb_hash_lock);
}
/*
 * Unlink a rule from its hash bucket and clear its hash linkage.
 */
static void
ilb_rule_hash_del(ilb_rule_t *rule)
{
	ilb_hash_t *bucket = rule->ir_hash;

	mutex_enter(&bucket->ilb_hash_lock);
	if (bucket->ilb_hash_rule == rule) {
		/* Rule is at the head of the bucket list. */
		bucket->ilb_hash_rule = rule->ir_hash_next;
		if (rule->ir_hash_next != NULL)
			rule->ir_hash_next->ir_hash_prev = NULL;
	} else {
		if (rule->ir_hash_prev != NULL)
			rule->ir_hash_prev->ir_hash_next = rule->ir_hash_next;
		if (rule->ir_hash_next != NULL)
			rule->ir_hash_next->ir_hash_prev = rule->ir_hash_prev;
	}
	mutex_exit(&bucket->ilb_hash_lock);

	rule->ir_hash_next = NULL;
	rule->ir_hash_prev = NULL;
	rule->ir_hash = NULL;
}
/*
 * Look up the rule that should handle a packet with the given IP
 * version (l3), transport protocol (l4), destination address/port and
 * zone.  On success the rule is returned with an extra reference held;
 * the caller releases it with ILB_RULE_REFRELE().
 *
 * Returns NULL if no usable rule matches.  *busy is set to B_TRUE when
 * a matching rule exists but is marked ILB_RULE_BUSY (being modified);
 * the caller should then drop the packet.  len is the packet length,
 * used only to update the rule's byte counters.
 */
static ilb_rule_t *
ilb_rule_hash(ilb_stack_t *ilbs, int l3, int l4, in6_addr_t *addr,
    in_port_t port, zoneid_t zoneid, uint32_t len, boolean_t *busy)
{
	int i;
	ilb_rule_t *rule;
	ipaddr_t v4_addr;

	*busy = B_FALSE;
	/* Hash on the low 32 bits; for a v4 rule that is the VIP itself. */
	IN6_V4MAPPED_TO_IPADDR(addr, v4_addr);
	i = ILB_RULE_HASH((uint8_t *)&v4_addr, ilbs->ilbs_rule_hash_size);
	port = ntohs(port);

	mutex_enter(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	for (rule = ilbs->ilbs_g_hash[i].ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		/* The port must match exactly or fall in the rule's range. */
		if (!rule->ir_port_range) {
			if (rule->ir_min_port != port)
				continue;
		} else {
			if (port < rule->ir_min_port ||
			    port > rule->ir_max_port) {
				continue;
			}
		}
		if (rule->ir_ipver != l3 || rule->ir_proto != l4 ||
		    rule->ir_zoneid != zoneid) {
			continue;
		}
		/* An unspecified VIP in the rule matches any address. */
		if (l3 == IPPROTO_IP) {
			if (rule->ir_target_v4 != INADDR_ANY &&
			    rule->ir_target_v4 != v4_addr) {
				continue;
			}
		} else {
			if (!IN6_IS_ADDR_UNSPECIFIED(&rule->ir_target_v6) &&
			    !IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6)) {
				continue;
			}
		}
		mutex_enter(&rule->ir_lock);
		if (!(rule->ir_flags & ILB_RULE_ENABLED)) {
			/* Rule is disabled: count the bypass, no match. */
			ILB_R_KSTAT(rule, pkt_not_processed);
			ILB_R_KSTAT_UPDATE(rule, bytes_not_processed, len);
			mutex_exit(&rule->ir_lock);
			rule = NULL;
			break;
		} else if (rule->ir_flags & ILB_RULE_BUSY) {
			/* Rule is being modified: tell the caller to drop. */
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, len);
			mutex_exit(&rule->ir_lock);
			*busy = B_TRUE;
			rule = NULL;
			break;
		} else {
			/* Return the rule with a reference held. */
			rule->ir_refcnt++;
			ASSERT(rule->ir_refcnt != 1);
			mutex_exit(&rule->ir_lock);
			break;
		}
	}
	mutex_exit(&ilbs->ilbs_g_hash[i].ilb_hash_lock);
	return (rule);
}
/*
 * Add a rule to the head of the stack-wide rule list and bump the
 * global rule counter.  Caller must hold ilbs_g_lock.
 */
static void
ilb_rule_g_add(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	rule->ir_next = ilbs->ilbs_rule_head;
	ilbs->ilbs_rule_head = rule;
	ILB_KSTAT_UPDATE(ilbs, num_rules, 1);
}
/*
 * Remove a rule from the stack-wide rule list and decrement the global
 * rule counter.  Caller must hold ilbs_g_lock.
 *
 * NOTE(review): when the rule is not found on the list this function
 * releases ilbs_g_lock before returning, even though the caller
 * acquired it and will normally release it too.  All visible callers
 * pass a rule just located under the same lock, so the not-found path
 * looks unreachable -- confirm before relying on it.
 */
static void
ilb_rule_g_del(ilb_stack_t *ilbs, ilb_rule_t *rule)
{
	ilb_rule_t *tmp_rule;
	ilb_rule_t *prev_rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	prev_rule = NULL;
	for (tmp_rule = ilbs->ilbs_rule_head; tmp_rule != NULL;
	    prev_rule = tmp_rule, tmp_rule = tmp_rule->ir_next) {
		if (tmp_rule == rule)
			break;
	}
	if (tmp_rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return;
	}
	if (prev_rule == NULL)
		ilbs->ilbs_rule_head = tmp_rule->ir_next;
	else
		prev_rule->ir_next = tmp_rule->ir_next;
	ILB_KSTAT_UPDATE(ilbs, num_rules, -1);
}
static int64_t
num_nat_src_v6(const in6_addr_t *a1, const in6_addr_t *a2)
{
int64_t ret;
uint32_t addr1, addr2;
addr1 = ntohl(a1->s6_addr32[3]);
addr2 = ntohl(a2->s6_addr32[3]);
if (a1->s6_addr32[0] != a2->s6_addr32[0] ||
a1->s6_addr32[1] != a2->s6_addr32[1] ||
a1->s6_addr32[2] > a2->s6_addr32[2] ||
(a1->s6_addr32[2] == a2->s6_addr32[2] && addr1 > addr2)) {
return (-1);
}
if (a1->s6_addr32[2] == a2->s6_addr32[2]) {
return (addr2 - addr1 + 1);
} else {
ret = (ntohl(a2->s6_addr32[2]) - ntohl(a1->s6_addr32[2]));
ret <<= 32;
ret = ret + addr1 - addr2;
return (ret + 1);
}
}
/*
 * Create a new load balancer rule from the user-supplied command.
 * Validates the IP version, transport protocol, VIP, port range, NAT
 * source range and sticky mask; lazily creates the stack's rule,
 * connection and NAT-source tables; rejects rules that clash with an
 * existing one (EEXIST); then allocates, initializes and publishes the
 * rule.  Returns 0 or an errno value.
 */
int
ilb_rule_add(ilb_stack_t *ilbs, zoneid_t zoneid, const ilb_rule_cmd_t *cmd)
{
	ilb_rule_t *rule;
	netstackid_t stackid;
	int ret;
	in_port_t min_port, max_port;
	int64_t num_src;

	/* IPPROTO_IP/IPPROTO_IPV6 encode the rule's IP version. */
	if (cmd->ip_ver != IPPROTO_IP && cmd->ip_ver != IPPROTO_IPV6)
		return (EINVAL);
	if (cmd->proto != IPPROTO_TCP && cmd->proto != IPPROTO_UDP)
		return (EINVAL);
	/* A NAT rule must supply a NAT source address range. */
	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		if (IS_ADDR_UNSPEC(&cmd->nat_src_start) ||
		    IS_ADDR_UNSPEC(&cmd->nat_src_end)) {
			return (EINVAL);
		}
	}
	/* A sticky rule must supply the client-address mask. */
	if ((cmd->flags & ILB_RULE_STICKY) &&
	    IS_ADDR_UNSPEC(&cmd->sticky_mask)) {
		return (EINVAL);
	}

	min_port = ntohs(cmd->min_port);
	max_port = ntohs(cmd->max_port);
	if (min_port > max_port)
		return (EINVAL);
	/* Port 0 means "all ports". */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}

	if (cmd->ip_ver == IPPROTO_IP) {
		in_addr_t v4_addr1, v4_addr2;

		/*
		 * The VIP must be a v4-mapped unicast address: not
		 * loopback, multicast, broadcast or INADDR_ANY.
		 */
		v4_addr1 = cmd->vip.s6_addr32[3];
		if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
		    CLASSD(v4_addr1) || v4_addr1 == INADDR_BROADCAST ||
		    v4_addr1 == INADDR_ANY ||
		    !IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}
		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			/* Same sanity checks on the NAT source range. */
			v4_addr1 = ntohl(cmd->nat_src_start.s6_addr32[3]);
			v4_addr2 = ntohl(cmd->nat_src_end.s6_addr32[3]);
			if ((*(uchar_t *)&v4_addr1) == IN_LOOPBACKNET ||
			    (*(uchar_t *)&v4_addr2) == IN_LOOPBACKNET ||
			    v4_addr1 == INADDR_BROADCAST ||
			    v4_addr2 == INADDR_BROADCAST ||
			    v4_addr1 == INADDR_ANY || v4_addr2 == INADDR_ANY ||
			    CLASSD(v4_addr1) || CLASSD(v4_addr2) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    !IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}
			num_src = v4_addr2 - v4_addr1 + 1;
			if (v4_addr1 > v4_addr2 || num_src > ILB_MAX_NAT_SRC)
				return (EINVAL);
		}
	} else {
		/* A v6 VIP must be a plain (non-mapped) unicast address. */
		if (IN6_IS_ADDR_LOOPBACK(&cmd->vip) ||
		    IN6_IS_ADDR_MULTICAST(&cmd->vip) ||
		    IN6_IS_ADDR_UNSPECIFIED(&cmd->vip) ||
		    IN6_IS_ADDR_V4MAPPED(&cmd->vip)) {
			return (EINVAL);
		}
		if (cmd->topo == ILB_TOPO_IMPL_NAT) {
			if (IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_LOOPBACK(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_MULTICAST(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_UNSPECIFIED(&cmd->nat_src_end) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_start) ||
			    IN6_IS_ADDR_V4MAPPED(&cmd->nat_src_end)) {
				return (EINVAL);
			}
			if ((num_src = num_nat_src_v6(&cmd->nat_src_start,
			    &cmd->nat_src_end)) < 0 ||
			    num_src > ILB_MAX_NAT_SRC) {
				return (EINVAL);
			}
		}
	}

	mutex_enter(&ilbs->ilbs_g_lock);
	/* Lazily create the rule/conn/NAT-source tables on first rule. */
	if (ilbs->ilbs_g_hash == NULL)
		ilb_rule_hash_init(ilbs);
	if (ilbs->ilbs_c2s_conn_hash == NULL) {
		ASSERT(ilbs->ilbs_s2c_conn_hash == NULL);
		ilb_conn_hash_init(ilbs);
		ilb_nat_src_init(ilbs);
	}

	/* Reject duplicate names and overlapping (vip, proto, ports). */
	if (ilb_match_rule(ilbs, zoneid, cmd->name, cmd->ip_ver, cmd->proto,
	    min_port, max_port, &cmd->vip)) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (EEXIST);
	}

	rule = kmem_zalloc(sizeof (ilb_rule_t), KM_NOSLEEP);
	if (rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (ENOMEM);
	}

	/* ir_name is zero-filled by kmem_zalloc(), so it stays terminated. */
	(void) memcpy(rule->ir_name, cmd->name, ILB_RULE_NAMESZ - 1);

	rule->ir_ks_instance = atomic_inc_uint_nv(&ilb_kstat_instance);
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	if ((rule->ir_ksp = ilb_rule_kstat_init(stackid, rule)) == NULL) {
		ret = ENOMEM;
		goto error;
	}

	if (cmd->topo == ILB_TOPO_IMPL_NAT) {
		rule->ir_nat_src_start = cmd->nat_src_start;
		rule->ir_nat_src_end = cmd->nat_src_end;
	}

	rule->ir_ipver = cmd->ip_ver;
	rule->ir_proto = cmd->proto;
	rule->ir_topo = cmd->topo;
	rule->ir_min_port = min_port;
	rule->ir_max_port = max_port;
	if (rule->ir_min_port != rule->ir_max_port)
		rule->ir_port_range = B_TRUE;
	else
		rule->ir_port_range = B_FALSE;
	rule->ir_zoneid = zoneid;
	rule->ir_target_v6 = cmd->vip;
	rule->ir_servers = NULL;
	rule->ir_conn_drain_timeout = cmd->conn_drain_timeout;

	/* Fall back to the per-protocol default expiry if none given. */
	if (cmd->nat_expiry != 0) {
		rule->ir_nat_expiry = cmd->nat_expiry;
	} else {
		switch (rule->ir_proto) {
		case IPPROTO_TCP:
			rule->ir_nat_expiry = ilb_conn_tcp_expiry;
			break;
		case IPPROTO_UDP:
			rule->ir_nat_expiry = ilb_conn_udp_expiry;
			break;
		default:
			/* The protocol was validated above; cannot happen. */
			cmn_err(CE_PANIC, "data corruption: wrong ir_proto: %p",
			    (void *)rule);
			break;
		}
	}
	if (cmd->sticky_expiry != 0)
		rule->ir_sticky_expiry = cmd->sticky_expiry;
	else
		rule->ir_sticky_expiry = ilb_sticky_expiry;

	if (cmd->flags & ILB_RULE_STICKY) {
		rule->ir_flags |= ILB_RULE_STICKY;
		rule->ir_sticky_mask = cmd->sticky_mask;
		/* The sticky table is also created lazily. */
		if (ilbs->ilbs_sticky_hash == NULL)
			ilb_sticky_hash_init(ilbs);
	}
	if (cmd->flags & ILB_RULE_ENABLED)
		rule->ir_flags |= ILB_RULE_ENABLED;

	mutex_init(&rule->ir_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rule->ir_cv, NULL, CV_DEFAULT, NULL);
	/* The creator starts with one reference on the rule. */
	rule->ir_refcnt = 1;

	switch (cmd->algo) {
	case ILB_ALG_IMPL_ROUNDROBIN:
		if ((rule->ir_alg = ilb_alg_rr_init(rule, NULL)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = ILB_ALG_IMPL_ROUNDROBIN;
		break;
	case ILB_ALG_IMPL_HASH_IP:
	case ILB_ALG_IMPL_HASH_IP_SPORT:
	case ILB_ALG_IMPL_HASH_IP_VIP:
		if ((rule->ir_alg = ilb_alg_hash_init(rule,
		    &cmd->algo)) == NULL) {
			ret = ENOMEM;
			goto error;
		}
		rule->ir_alg_type = cmd->algo;
		break;
	default:
		ret = EINVAL;
		goto error;
	}

	/* Make the rule visible: global list and hash table. */
	ilb_rule_g_add(ilbs, rule);
	ilb_rule_hash_add(ilbs, rule, &cmd->vip);
	mutex_exit(&ilbs->ilbs_g_lock);
	return (0);

error:
	mutex_exit(&ilbs->ilbs_g_lock);
	if (rule->ir_ksp != NULL) {
		/* stackid is always set before ir_ksp is created. */
		kstat_delete_netstack(rule->ir_ksp, stackid);
	}
	kmem_free(rule, sizeof (ilb_rule_t));
	return (ret);
}
/*
 * Tear down a rule that has already been unhooked from the hash table
 * and the global list: finalize its algorithm state, free every server
 * (waiting for each server's references to drain first) and finally
 * free the rule itself.
 */
static void
ilb_rule_del_common(ilb_stack_t *ilbs, ilb_rule_t *tmp_rule)
{
	netstackid_t stackid;
	ilb_server_t *server;

	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	tmp_rule->ir_alg->ilb_alg_fini(&tmp_rule->ir_alg);

	while ((server = tmp_rule->ir_servers) != NULL) {
		mutex_enter(&server->iser_lock);
		ilb_destroy_nat_src(&server->iser_nat_src);
		if (tmp_rule->ir_conn_drain_timeout != 0) {
			/*
			 * Mark the server's die time so existing NAT
			 * connections can drain before it disappears.
			 */
			(void) atomic_swap_64(
			    (uint64_t *)&server->iser_die_time,
			    ddi_get_lbolt64() +
			    SEC_TO_TICK(tmp_rule->ir_conn_drain_timeout));
		}
		/* Wait until we hold the only reference to the server. */
		while (server->iser_refcnt > 1)
			cv_wait(&server->iser_cv, &server->iser_lock);
		tmp_rule->ir_servers = server->iser_next;
		/*
		 * NOTE(review): the server is freed with iser_lock still
		 * held; at this point refcnt == 1 so nobody else can be
		 * waiting on it, but confirm this is intentional.
		 */
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
	}

	ASSERT(tmp_rule->ir_ksp != NULL);
	kstat_delete_netstack(tmp_rule->ir_ksp, stackid);
	kmem_free(tmp_rule, sizeof (ilb_rule_t));
}
static void
ilb_rule_del_tq(void *arg)
{
ilb_stack_t *ilbs = ((ilb_rule_tq_t *)arg)->ilbs;
ilb_rule_t *rule = ((ilb_rule_tq_t *)arg)->rule;
mutex_enter(&rule->ir_lock);
while (rule->ir_refcnt > 1)
cv_wait(&rule->ir_cv, &rule->ir_lock);
ilb_rule_del_common(ilbs, rule);
kmem_free(arg, sizeof (ilb_rule_tq_t));
}
/*
 * Delete the named rule in the given zone.  The rule is unhooked
 * immediately; the actual teardown (waiting for outstanding references)
 * is pushed to the rule taskq so this call does not block on packets
 * still using the rule.  Returns 0 or an errno value.
 */
int
ilb_rule_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name)
{
	ilb_rule_t *rule;
	ilb_rule_tq_t *arg;
	int err;

	mutex_enter(&ilbs->ilbs_g_lock);
	rule = ilb_find_rule_locked(ilbs, zoneid, name, &err);
	if (rule == NULL) {
		mutex_exit(&ilbs->ilbs_g_lock);
		return (err);
	}

	/*
	 * Remove the rule from the hash table and the global list first
	 * so no new lookup can find it, then drop the lookup reference.
	 */
	ilb_rule_hash_del(rule);
	ilb_rule_g_del(ilbs, rule);
	mutex_exit(&ilbs->ilbs_g_lock);
	ILB_RULE_REFRELE(rule);

	arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
	arg->ilbs = ilbs;
	arg->rule = rule;
	(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq, arg,
	    TQ_SLEEP);
	return (0);
}
/*
 * Check whether a v6 address is one of this stack's VIPs.  When
 * ret_rule is non-NULL and a non-busy rule matches, the rule is
 * returned through it with a reference held.  A matching rule that is
 * BUSY counts as no match.
 */
boolean_t
ilb_rule_match_vip_v6(ilb_stack_t *ilbs, in6_addr_t *vip, ilb_rule_t **ret_rule)
{
	ilb_hash_t *bucket;
	ilb_rule_t *rule;
	boolean_t found = B_FALSE;
	int idx;

	idx = ILB_RULE_HASH((uint8_t *)&vip->s6_addr32[3],
	    ilbs->ilbs_rule_hash_size);
	bucket = &ilbs->ilbs_g_hash[idx];

	mutex_enter(&bucket->ilb_hash_lock);
	for (rule = bucket->ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (!IN6_ARE_ADDR_EQUAL(vip, &rule->ir_target_v6))
			continue;

		mutex_enter(&rule->ir_lock);
		if (rule->ir_flags & ILB_RULE_BUSY) {
			/* Rule is being modified; report no match. */
			mutex_exit(&rule->ir_lock);
			break;
		}
		if (ret_rule != NULL) {
			/* Hand back a held reference. */
			rule->ir_refcnt++;
			*ret_rule = rule;
		}
		mutex_exit(&rule->ir_lock);
		found = B_TRUE;
		break;
	}
	mutex_exit(&bucket->ilb_hash_lock);
	return (found);
}
/*
 * v4 counterpart of ilb_rule_match_vip_v6(): check whether addr is one
 * of this stack's VIPs (compared against the low word of the rule's
 * v4-mapped target).  Same reference and BUSY semantics.
 */
boolean_t
ilb_rule_match_vip_v4(ilb_stack_t *ilbs, ipaddr_t addr, ilb_rule_t **ret_rule)
{
	ilb_hash_t *bucket;
	ilb_rule_t *rule;
	boolean_t found = B_FALSE;
	int idx;

	idx = ILB_RULE_HASH((uint8_t *)&addr, ilbs->ilbs_rule_hash_size);
	bucket = &ilbs->ilbs_g_hash[idx];

	mutex_enter(&bucket->ilb_hash_lock);
	for (rule = bucket->ilb_hash_rule; rule != NULL;
	    rule = rule->ir_hash_next) {
		if (rule->ir_target_v6.s6_addr32[3] != addr)
			continue;

		mutex_enter(&rule->ir_lock);
		if (rule->ir_flags & ILB_RULE_BUSY) {
			/* Rule is being modified; report no match. */
			mutex_exit(&rule->ir_lock);
			break;
		}
		if (ret_rule != NULL) {
			/* Hand back a held reference. */
			rule->ir_refcnt++;
			*ret_rule = rule;
		}
		mutex_exit(&rule->ir_lock);
		found = B_TRUE;
		break;
	}
	mutex_exit(&bucket->ilb_hash_lock);
	return (found);
}
/*
 * Find a rule by zone and (case-insensitive) name; caller must hold
 * ilbs_g_lock.  On success the rule is returned with a reference held
 * and *err set to 0.  Returns NULL with *err set to EINPROGRESS when
 * the rule is being modified, or ENOENT when it does not exist.
 */
static ilb_rule_t *
ilb_find_rule_locked(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid ||
		    strcasecmp(rule->ir_name, name) != 0)
			continue;

		mutex_enter(&rule->ir_lock);
		if (rule->ir_flags & ILB_RULE_BUSY) {
			mutex_exit(&rule->ir_lock);
			*err = EINPROGRESS;
			return (NULL);
		}
		rule->ir_refcnt++;
		mutex_exit(&rule->ir_lock);
		*err = 0;
		return (rule);
	}
	*err = ENOENT;
	return (NULL);
}
/*
 * Locking wrapper around ilb_find_rule_locked(): look up a rule by zone
 * and name, returning it with a reference held, or NULL with *err set.
 */
ilb_rule_t *
ilb_find_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    int *err)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	rule = ilb_find_rule_locked(ilbs, zoneid, name, err);
	mutex_exit(&ilbs->ilbs_g_lock);
	return (rule);
}
/*
 * Return B_TRUE if a new rule with the given name, protocol, port range
 * and VIP would clash with an existing rule in the zone: either the
 * name is taken, or the same (IP version, protocol) has an overlapping
 * port range and a VIP that can collide (equal, or either side is the
 * unspecified/ANY address).  Caller must hold ilbs_g_lock.
 */
static boolean_t
ilb_match_rule(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name, int l3,
    int l4, in_port_t min_port, in_port_t max_port, const in6_addr_t *addr)
{
	ilb_rule_t *rule;

	ASSERT(mutex_owned(&ilbs->ilbs_g_lock));
	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
		if (rule->ir_zoneid != zoneid)
			continue;
		/* Rule names must be unique within a zone. */
		if (strcasecmp(rule->ir_name, name) == 0)
			return (B_TRUE);
		/* Otherwise only same-protocol rules can conflict. */
		if (rule->ir_ipver != l3 || rule->ir_proto != l4)
			continue;
		/* Disjoint port ranges never conflict. */
		if (rule->ir_max_port < min_port ||
		    rule->ir_min_port > max_port)
			continue;
		/* Overlapping ports conflict if the VIPs can collide. */
		if (V6_OR_V4_INADDR_ANY(*addr) ||
		    V6_OR_V4_INADDR_ANY(rule->ir_target_v6) ||
		    IN6_ARE_ADDR_EQUAL(addr, &rule->ir_target_v6))
			return (B_TRUE);
	}
	return (B_FALSE);
}
/*
 * Enable a rule, given either by name (rule_name) or directly
 * (in_rule) -- exactly one of the two must be non-NULL.  When looked up
 * by name, the temporary reference is released before returning.
 */
int
ilb_rule_enable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
	ilb_rule_t *rule = in_rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if (rule == NULL) {
		rule = ilb_find_rule(ilbs, zoneid, rule_name, &err);
		if (rule == NULL)
			return (err);
	}

	mutex_enter(&rule->ir_lock);
	rule->ir_flags |= ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/* Drop the reference taken by the lookup. */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}
/*
 * Disable a rule, given either by name (rule_name) or directly
 * (in_rule) -- exactly one of the two must be non-NULL.  When looked up
 * by name, the temporary reference is released before returning.
 */
int
ilb_rule_disable(ilb_stack_t *ilbs, zoneid_t zoneid,
    const char *rule_name, ilb_rule_t *in_rule)
{
	ilb_rule_t *rule = in_rule;
	int err;

	ASSERT((in_rule == NULL && rule_name != NULL) ||
	    (in_rule != NULL && rule_name == NULL));
	if (rule == NULL) {
		rule = ilb_find_rule(ilbs, zoneid, rule_name, &err);
		if (rule == NULL)
			return (err);
	}

	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_ENABLED;
	mutex_exit(&rule->ir_lock);

	/* Drop the reference taken by the lookup. */
	if (in_rule == NULL)
		ILB_RULE_REFRELE(rule);
	return (0);
}
/*
 * Enable every rule belonging to the given zone.
 */
void
ilb_rule_enable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
		/* Passing the rule directly cannot fail. */
		if (rule->ir_zoneid == zoneid)
			(void) ilb_rule_enable(ilbs, zoneid, NULL, rule);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}
/*
 * Disable every rule belonging to the given zone.
 */
void
ilb_rule_disable_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (rule = ilbs->ilbs_rule_head; rule != NULL; rule = rule->ir_next) {
		/* Passing the rule directly cannot fail. */
		if (rule->ir_zoneid == zoneid)
			(void) ilb_rule_disable(ilbs, zoneid, NULL, rule);
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}
/*
 * Delete every rule belonging to the given zone, dispatching the actual
 * teardown of each rule to the rule taskq (as ilb_rule_del() does).
 *
 * The previous implementation did
 *	while ((rule = ilbs->ilbs_rule_head) != NULL) {
 *		if (rule->ir_zoneid != zoneid)
 *			continue;
 * which re-reads the unchanged list head on a zone mismatch and spins
 * forever, holding ilbs_g_lock.  We instead walk the list with a
 * cursor, skipping rules of other zones, and restart from the head
 * after each deletion since the list may change while the lock is
 * dropped around the taskq dispatch.
 */
void
ilb_rule_del_all(ilb_stack_t *ilbs, zoneid_t zoneid)
{
	ilb_rule_t *rule;
	ilb_rule_tq_t *arg;

	mutex_enter(&ilbs->ilbs_g_lock);
	rule = ilbs->ilbs_rule_head;
	while (rule != NULL) {
		if (rule->ir_zoneid != zoneid) {
			/* Not ours; advance past it. */
			rule = rule->ir_next;
			continue;
		}

		/* Unhook the rule so no new lookup can find it. */
		ilb_rule_hash_del(rule);
		ilb_rule_g_del(ilbs, rule);
		mutex_exit(&ilbs->ilbs_g_lock);

		arg = kmem_alloc(sizeof (ilb_rule_tq_t), KM_SLEEP);
		arg->ilbs = ilbs;
		arg->rule = rule;
		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_rule_del_tq,
		    arg, TQ_SLEEP);

		mutex_enter(&ilbs->ilbs_g_lock);
		/* The list may have changed while the lock was dropped. */
		rule = ilbs->ilbs_rule_head;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
}
/*
 * Return B_TRUE if this stack has at least one rule configured.
 */
boolean_t
ilb_has_rules(ilb_stack_t *ilbs)
{
	return (ilbs->ilbs_rule_head == NULL ? B_FALSE : B_TRUE);
}
/*
 * Common code to enable or disable one back-end server (identified by
 * addr) of a rule.  The rule is given either directly (rule != NULL) or
 * by name, in which case it is looked up and the temporary reference is
 * released before returning.  Returns 0 or an errno value (ENOENT if no
 * server with that address exists).
 */
static int
ilb_server_toggle(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr, boolean_t enable)
{
	ilb_server_t *tmp_server;
	int ret;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));
	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	/* Find the server with the given address. */
	for (tmp_server = rule->ir_servers; tmp_server != NULL;
	    tmp_server = tmp_server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&tmp_server->iser_addr_v6, addr))
			break;
	}
	if (tmp_server == NULL) {
		ret = ENOENT;
		goto done;
	}

	if (enable) {
		/* Tell the algorithm first; only mark enabled on success. */
		ret = rule->ir_alg->ilb_alg_server_enable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_TRUE;
			tmp_server->iser_die_time = 0;
		}
	} else {
		ret = rule->ir_alg->ilb_alg_server_disable(tmp_server,
		    rule->ir_alg->ilb_alg_data);
		if (ret == 0) {
			tmp_server->iser_enabled = B_FALSE;
			if (rule->ir_conn_drain_timeout != 0) {
				/*
				 * Let existing connections drain before
				 * the server's NAT entries are reaped.
				 */
				(void) atomic_swap_64(
				    (uint64_t *)&tmp_server->iser_die_time,
				    ddi_get_lbolt64() + SEC_TO_TICK(
				    rule->ir_conn_drain_timeout));
			}
		}
	}
done:
	/* Release the reference taken by the name lookup, if any. */
	if (rule_name != NULL)
		ILB_RULE_REFRELE(rule);
	return (ret);
}
/*
 * Enable the server with the given address in a rule; see
 * ilb_server_toggle() for the rule/name convention.
 */
int
ilb_server_enable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_TRUE));
}
/*
 * Disable the server with the given address in a rule; see
 * ilb_server_toggle() for the rule/name convention.
 */
int
ilb_server_disable(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	return (ilb_server_toggle(ilbs, zoneid, name, rule, addr, B_FALSE));
}
/*
 * Add a back-end server described by info to a rule.  The rule is
 * marked ILB_RULE_BUSY for the duration so packet processing and other
 * modifications stay out; the routine then waits for outstanding
 * references to drain before touching the server list.  Returns 0 or an
 * errno value.
 */
int
ilb_server_add(ilb_stack_t *ilbs, ilb_rule_t *rule, ilb_server_info_t *info)
{
	ilb_server_t *server;
	netstackid_t stackid;
	int ret = 0;
	in_port_t min_port, max_port;
	in_port_t range;

	min_port = ntohs(info->min_port);
	max_port = ntohs(info->max_port);
	if (min_port > max_port)
		return (EINVAL);
	/* Port 0 means "all ports". */
	if (min_port == 0) {
		min_port = 1;
		max_port = 65535;
	}
	range = max_port - min_port;

	/* Become the exclusive modifier of the rule. */
	mutex_enter(&rule->ir_lock);
	while (rule->ir_flags & ILB_RULE_BUSY) {
		/* cv_wait_sig() returns 0 when interrupted by a signal. */
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}
	rule->ir_flags |= ILB_RULE_BUSY;
	/*
	 * Wait for other references to drain (> 2: presumably our
	 * caller's reference plus the rule's own -- confirm the refcnt
	 * convention at the ILB_RULE_REFRELE definition).
	 */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	/* The server's address family must match the rule's IP version. */
	if ((IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IP) ||
	    (!IN6_IS_ADDR_V4MAPPED(&info->addr) &&
	    rule->ir_ipver != IPPROTO_IPV6)) {
		ret = EINVAL;
		goto end;
	}

	if (rule->ir_topo == ILB_TOPO_IMPL_DSR) {
		/* DSR cannot rewrite ports: ranges must match exactly. */
		if (rule->ir_max_port != max_port ||
		    rule->ir_min_port != min_port) {
			ret = EINVAL;
			goto end;
		}
	} else {
		/*
		 * Otherwise the server range must be the same width as
		 * the rule's, or collapse to a single port (range 0).
		 */
		if ((range != rule->ir_max_port - rule->ir_min_port) &&
		    range != 0) {
			ret = EINVAL;
			goto end;
		}
	}

	/* Reject duplicates by address or (case-insensitive) name. */
	for (server = rule->ir_servers; server != NULL;
	    server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, &info->addr) ||
		    strcasecmp(server->iser_name, info->name) == 0) {
			break;
		}
	}
	if (server != NULL) {
		ret = EEXIST;
		goto end;
	}

	if ((server = kmem_zalloc(sizeof (ilb_server_t), KM_NOSLEEP)) == NULL) {
		ret = ENOMEM;
		goto end;
	}

	/* iser_name stays NUL-terminated thanks to kmem_zalloc(). */
	(void) memcpy(server->iser_name, info->name, ILB_SERVER_NAMESZ - 1);
	(void) inet_ntop(AF_INET6, &info->addr, server->iser_ip_addr,
	    sizeof (server->iser_ip_addr));
	stackid = (netstackid_t)(uintptr_t)ilbs->ilbs_ksp->ks_private;
	server->iser_ksp = ilb_server_kstat_init(stackid, rule, server);
	if (server->iser_ksp == NULL) {
		kmem_free(server, sizeof (ilb_server_t));
		ret = EINVAL;
		goto end;
	}

	server->iser_stackid = stackid;
	server->iser_addr_v6 = info->addr;
	server->iser_min_port = min_port;
	server->iser_max_port = max_port;
	if (min_port != max_port)
		server->iser_port_range = B_TRUE;
	else
		server->iser_port_range = B_FALSE;

	/*
	 * A NAT rule needs a per-server NAT source; port 0 asks
	 * ilb_create_nat_src() to cover the whole range.
	 */
	if (rule->ir_topo == ILB_TOPO_IMPL_NAT) {
		in_port_t port;

		if (server->iser_port_range)
			port = 0;
		else
			port = server->iser_min_port;
		if ((ret = ilb_create_nat_src(ilbs, &server->iser_nat_src,
		    &server->iser_addr_v6, port, &rule->ir_nat_src_start,
		    num_nat_src_v6(&rule->ir_nat_src_start,
		    &rule->ir_nat_src_end))) != 0) {
			kstat_delete_netstack(server->iser_ksp, stackid);
			kmem_free(server, sizeof (ilb_server_t));
			goto end;
		}
	}

	/* The rule's server list holds one reference on the server. */
	server->iser_refcnt = 1;
	server->iser_enabled = info->flags & ILB_SERVER_ENABLED ? B_TRUE :
	    B_FALSE;
	mutex_init(&server->iser_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&server->iser_cv, NULL, CV_DEFAULT, NULL);

	/* Let the load balancing algorithm know about the new server. */
	ASSERT(rule->ir_alg != NULL);
	if ((ret = rule->ir_alg->ilb_alg_server_add(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		kstat_delete_netstack(server->iser_ksp, stackid);
		kmem_free(server, sizeof (ilb_server_t));
		goto end;
	}

	/* Insert at the head of the rule's server list. */
	if (rule->ir_servers == NULL) {
		server->iser_next = NULL;
	} else {
		server->iser_next = rule->ir_servers;
	}
	rule->ir_servers = server;
	ILB_R_KSTAT(rule, num_servers);

end:
	/* Clear BUSY and wake anybody waiting to modify the rule. */
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}
/*
 * Taskq function that finishes deleting a server: wait for the last
 * reference to go away, then delete its kstat and free it.
 */
static void
ilb_server_del_tq(void *arg)
{
	ilb_server_t *server = arg;

	mutex_enter(&server->iser_lock);
	while (server->iser_refcnt > 1)
		cv_wait(&server->iser_cv, &server->iser_lock);
	kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
	kmem_free(server, sizeof (ilb_server_t));
}
/*
 * Remove the server with the given address from a rule.  The rule is
 * given either directly (rule != NULL) or by name; the BUSY protocol is
 * the same as in ilb_server_add().  If the server still has references,
 * the final teardown is pushed to the rule taskq.  Returns 0 or an
 * errno value.
 */
int
ilb_server_del(ilb_stack_t *ilbs, zoneid_t zoneid, const char *rule_name,
    ilb_rule_t *rule, in6_addr_t *addr)
{
	ilb_server_t *server;
	ilb_server_t *prev_server;
	int ret = 0;

	ASSERT((rule == NULL && rule_name != NULL) ||
	    (rule != NULL && rule_name == NULL));
	if (rule == NULL) {
		if ((rule = ilb_find_rule(ilbs, zoneid, rule_name,
		    &ret)) == NULL) {
			return (ret);
		}
	}

	/* Become the exclusive modifier of the rule. */
	mutex_enter(&rule->ir_lock);
	while (rule->ir_flags & ILB_RULE_BUSY) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			/*
			 * Interrupted: drop the lookup reference by hand
			 * (we still hold ir_lock) and wake any waiter
			 * that was blocked on the refcnt draining.
			 */
			if (rule_name != NULL) {
				if (--rule->ir_refcnt <= 2)
					cv_signal(&rule->ir_cv);
			}
			mutex_exit(&rule->ir_lock);
			return (EINTR);
		}
	}
	rule->ir_flags |= ILB_RULE_BUSY;
	/* Wait for other references to the rule to drain. */
	while (rule->ir_refcnt > 2) {
		if (cv_wait_sig(&rule->ir_cv, &rule->ir_lock) == 0) {
			mutex_exit(&rule->ir_lock);
			ret = EINTR;
			goto end;
		}
	}
	mutex_exit(&rule->ir_lock);

	/* Find the server and remember its predecessor for unlinking. */
	prev_server = NULL;
	for (server = rule->ir_servers; server != NULL;
	    prev_server = server, server = server->iser_next) {
		if (IN6_ARE_ADDR_EQUAL(&server->iser_addr_v6, addr))
			break;
	}
	if (server == NULL) {
		ret = ENOENT;
		goto end;
	}

	/* Let the algorithm veto/record the removal first. */
	if ((ret = rule->ir_alg->ilb_alg_server_del(server,
	    rule->ir_alg->ilb_alg_data)) != 0) {
		goto end;
	}

	if (prev_server == NULL)
		rule->ir_servers = server->iser_next;
	else
		prev_server->iser_next = server->iser_next;
	ILB_R_KSTAT_UPDATE(rule, num_servers, -1);
	server->iser_enabled = B_FALSE;

	mutex_enter(&server->iser_lock);
	ilb_destroy_nat_src(&server->iser_nat_src);
	if (rule->ir_conn_drain_timeout != 0) {
		/* Let existing NAT connections drain before reaping. */
		(void) atomic_swap_64((uint64_t *)&server->iser_die_time,
		    ddi_get_lbolt64() +
		    SEC_TO_TICK(rule->ir_conn_drain_timeout));
	}
	if (server->iser_refcnt > 1) {
		/* Still referenced: let the taskq wait and free it. */
		(void) taskq_dispatch(ilbs->ilbs_rule_taskq, ilb_server_del_tq,
		    server, TQ_SLEEP);
		mutex_exit(&server->iser_lock);
	} else {
		kstat_delete_netstack(server->iser_ksp, server->iser_stackid);
		kmem_free(server, sizeof (ilb_server_t));
	}

end:
	/* Clear BUSY, drop the lookup reference (if any) and wake waiters. */
	mutex_enter(&rule->ir_lock);
	rule->ir_flags &= ~ILB_RULE_BUSY;
	if (rule_name != NULL)
		rule->ir_refcnt--;
	cv_signal(&rule->ir_cv);
	mutex_exit(&rule->ir_lock);
	return (ret);
}
/*
 * Handle an ICMP packet whose destination is one of our VIPs: answer
 * echo requests in place on behalf of the servers, and steer
 * "fragmentation needed" errors to the server of the matching NAT
 * connection.  Returns ILB_PASSED (not ours), ILB_BALANCED (*lb_dst
 * set, packet rewritten/forwarded) or ILB_DROPPED.
 */
static int
ilb_icmp_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha,
    icmph_t *icmph, ipaddr_t *lb_dst)
{
	ipaddr_t vip;
	ilb_rule_t *rule;
	in6_addr_t addr6;

	if (!ilb_rule_match_vip_v4(ilbs, ipha->ipha_dst, &rule))
		return (ILB_PASSED);

	/* The full ICMP header must be present in this mblk. */
	if ((uint8_t *)icmph + sizeof (icmph_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmph->icmph_type) {
	case ICMP_ECHO_REQUEST:
		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		/*
		 * Turn the request into a reply in place: flip the type,
		 * recompute the ICMP checksum, refresh the TTL and swap
		 * the IP source and destination addresses.
		 */
		icmph->icmph_type = ICMP_ECHO_REPLY;
		icmph->icmph_checksum = 0;
		icmph->icmph_checksum = IP_CSUM(mp, IPH_HDR_LENGTH(ipha), 0);
		ipha->ipha_ttl =
		    ilbs->ilbs_netstack->netstack_ip->ips_ip_def_ttl;
		*lb_dst = ipha->ipha_src;
		vip = ipha->ipha_dst;
		ipha->ipha_dst = ipha->ipha_src;
		ipha->ipha_src = vip;
		return (ILB_BALANCED);
	case ICMP_DEST_UNREACHABLE: {
		int ret;

		/* Only "fragmentation needed" is load balanced. */
		if (icmph->icmph_code != ICMP_FRAGMENTATION_NEEDED) {
			ILB_R_KSTAT(rule, icmp_dropped);
			ILB_RULE_REFRELE(rule);
			return (ILB_DROPPED);
		}
		/* Route the error to the server of the embedded conn. */
		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IP, ipha, icmph,
		    &addr6)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		IN6_V4MAPPED_TO_IPADDR(&addr6, *lb_dst);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}
/*
 * v6 counterpart of ilb_icmp_v4(): answer ICMPv6 echo requests to a VIP
 * in place, and steer "packet too big" errors to the server of the
 * matching NAT connection.  Returns ILB_PASSED, ILB_BALANCED (*lb_dst
 * set) or ILB_DROPPED.
 */
static int
ilb_icmp_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h,
    icmp6_t *icmp6, in6_addr_t *lb_dst)
{
	ilb_rule_t *rule;

	if (!ilb_rule_match_vip_v6(ilbs, &ip6h->ip6_dst, &rule))
		return (ILB_PASSED);

	/* The full ICMPv6 header must be present in this mblk. */
	if ((uint8_t *)icmp6 + sizeof (icmp6_t) > mp->b_wptr) {
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}

	switch (icmp6->icmp6_type) {
	case ICMP6_ECHO_REQUEST: {
		int hdr_len;

		ILB_R_KSTAT(rule, icmp_echo_processed);
		ILB_RULE_REFRELE(rule);

		/*
		 * Turn the request into a reply in place.  The payload
		 * length is seeded into the checksum field before
		 * IP_CSUM() folds in the pseudo header; then the flow
		 * label is cleared, hop limit refreshed and the source
		 * and destination addresses swapped.
		 */
		icmp6->icmp6_type = ICMP6_ECHO_REPLY;
		icmp6->icmp6_cksum = ip6h->ip6_plen;
		hdr_len = (char *)icmp6 - (char *)ip6h;
		icmp6->icmp6_cksum = IP_CSUM(mp, hdr_len,
		    ilb_pseudo_sum_v6(ip6h, IPPROTO_ICMPV6));
		ip6h->ip6_vcf &= ~IPV6_FLOWINFO_FLOWLABEL;
		ip6h->ip6_hops =
		    ilbs->ilbs_netstack->netstack_ip->ips_ipv6_def_hops;
		*lb_dst = ip6h->ip6_src;
		ip6h->ip6_src = ip6h->ip6_dst;
		ip6h->ip6_dst = *lb_dst;
		return (ILB_BALANCED);
	}
	case ICMP6_PACKET_TOO_BIG: {
		int ret;

		/* Route the error to the server of the embedded conn. */
		if (ilb_check_icmp_conn(ilbs, mp, IPPROTO_IPV6, ip6h, icmp6,
		    lb_dst)) {
			ILB_R_KSTAT(rule, icmp_2big_processed);
			ret = ILB_BALANCED;
		} else {
			ILB_R_KSTAT(rule, icmp_2big_dropped);
			ret = ILB_DROPPED;
		}
		ILB_RULE_REFRELE(rule);
		return (ret);
	}
	default:
		ILB_R_KSTAT(rule, icmp_dropped);
		ILB_RULE_REFRELE(rule);
		return (ILB_DROPPED);
	}
}
/*
 * The core load-balancing routine, shared by the v4 and v6 entry points.
 *
 * Flow:
 *   1. Extract the transport ports (TCP/UDP only; others pass through).
 *   2. If the packet belongs to an existing conn-hash entry, it has already
 *      been NAT'ed/balanced there -- return ILB_BALANCED immediately.
 *   3. Otherwise look up a matching rule, ask the rule's algorithm for a
 *      back-end server, optionally consult/create a sticky entry, then set
 *      up NAT state according to the rule's topology.
 *
 * src/dst are always v4-mapped or native v6 addresses; l3 is IPPROTO_IP or
 * IPPROTO_IPV6; iph/tph point at the IP and transport headers inside mp;
 * pkt_len is the full packet length for kstat accounting.  On ILB_BALANCED,
 * *lb_dst is the (possibly NAT'ed) destination to forward to.
 *
 * Returns ILB_PASSED, ILB_BALANCED or ILB_DROPPED.
 */
static int
ilb_check(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, in6_addr_t *src,
    in6_addr_t *dst, int l3, int l4, void *iph, uint8_t *tph, uint32_t pkt_len,
    in6_addr_t *lb_dst)
{
	in_port_t		sport, dport;
	tcpha_t			*tcph;
	udpha_t			*udph;
	ilb_rule_t		*rule;
	ilb_server_t		*server;
	boolean_t		balanced;
	struct ilb_sticky_s	*s = NULL;
	int			ret;
	uint32_t		ip_sum, tp_sum;
	ilb_nat_info_t		info;
	uint16_t		nat_src_idx;
	boolean_t		busy;

	ret = 0;
	/*
	 * Only TCP and UDP are balanced; pull out the ports (kept in
	 * network byte order throughout).  Truncated headers are dropped.
	 */
	switch (l4) {
	case IPPROTO_TCP:
		if (tph + TCP_MIN_HEADER_LENGTH > mp->b_wptr)
			return (ILB_DROPPED);
		tcph = (tcpha_t *)tph;
		sport = tcph->tha_lport;
		dport = tcph->tha_fport;
		break;
	case IPPROTO_UDP:
		if (tph + sizeof (udpha_t) > mp->b_wptr)
			return (ILB_DROPPED);
		udph = (udpha_t *)tph;
		sport = udph->uha_src_port;
		dport = udph->uha_dst_port;
		break;
	default:
		return (ILB_PASSED);
	}

	/*
	 * Fast path: a packet of an established connection is handled
	 * (and NAT'ed, if applicable) entirely by the conn hash lookup.
	 */
	if (ilb_check_conn(ilbs, l3, iph, l4, tph, src, dst, sport, dport,
	    pkt_len, lb_dst)) {
		return (ILB_BALANCED);
	}

	/*
	 * New flow: find a rule.  The rule comes back held; every exit
	 * path below must ILB_RULE_REFRELE() it.  "busy" means a matching
	 * rule exists but is being torn down, in which case we drop rather
	 * than pass the packet up the stack.
	 */
	rule = ilb_rule_hash(ilbs, l3, l4, dst, dport, ill->ill_zoneid,
	    pkt_len, &busy);
	if (rule == NULL) {
		if (busy)
			return (ILB_DROPPED);
		else
			return (ILB_PASSED);
	}

	/* Let the rule's algorithm pick a back-end server. */
	balanced = rule->ir_alg->ilb_alg_lb(src, sport, dst, dport,
	    rule->ir_alg->ilb_alg_data, &server);
	if (!balanced)
		goto no_server;

	/*
	 * Session stickiness: map the (masked) client address to a server,
	 * possibly overriding the algorithm's choice with a previously
	 * assigned one.  On success, s holds a sticky reference that the
	 * topology code below must release or hand off to ilb_conn_add().
	 */
	if (rule->ir_flags & ILB_RULE_STICKY) {
		in6_addr_t addr;

		V6_MASK_COPY(*src, rule->ir_sticky_mask, addr);
		if ((server = ilb_sticky_find_add(ilbs, rule, &addr, server,
		    &s, &nat_src_idx)) == NULL) {
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			goto no_server;
		}
	}

	*lb_dst = server->iser_addr_v6;
	ILB_S_KSTAT(server, pkt_processed);
	ILB_S_KSTAT_UPDATE(server, bytes_processed, pkt_len);

	switch (rule->ir_topo) {
	case ILB_TOPO_IMPL_NAT: {
		ilb_nat_src_entry_t	*src_ent;
		uint16_t		*src_idx;

		/* Full NAT rewrites both source and destination. */
		info.vip = *dst;
		info.dport = dport;
		info.src = *src;
		info.sport = sport;

		/*
		 * With stickiness, reuse the NAT source slot recorded in
		 * the sticky entry so the same client keeps the same
		 * translated source.
		 */
		if (s != NULL)
			src_idx = &nat_src_idx;
		else
			src_idx = NULL;
		if ((src_ent = ilb_alloc_nat_addr(server->iser_nat_src,
		    &info.nat_src, &info.nat_sport, src_idx)) == NULL) {
			/* No free NAT source port: drop and release sticky. */
			if (s != NULL)
				ilb_sticky_refrele(s);
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, noport_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, noport_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		info.src_ent = src_ent;
		info.nat_dst = server->iser_addr_v6;
		/*
		 * Port-range rules preserve the offset into the range;
		 * otherwise collapse to the server's single (min) port.
		 */
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		/*
		 * ilb_conn_add() consumes the sticky reference s on
		 * success and releases it on failure.
		 */
		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_full_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
		ret = ILB_BALANCED;
		break;
	}
	case ILB_TOPO_IMPL_HALF_NAT:
		/* Half NAT rewrites only the destination. */
		info.vip = *dst;
		info.nat_dst = server->iser_addr_v6;
		info.dport = dport;
		if (rule->ir_port_range && server->iser_port_range) {
			info.nat_dport = htons(ntohs(dport) -
			    rule->ir_min_port + server->iser_min_port);
		} else {
			info.nat_dport = htons(server->iser_min_port);
		}

		if (ilb_conn_add(ilbs, rule, server, src, sport, dst,
		    dport, &info, &ip_sum, &tp_sum, s) != 0) {
			ILB_R_KSTAT(rule, pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
			ILB_R_KSTAT(rule, nomem_pkt_dropped);
			ILB_R_KSTAT_UPDATE(rule, nomem_bytes_dropped, pkt_len);
			ret = ILB_DROPPED;
			break;
		}
		ilb_half_nat(l3, iph, l4, tph, &info, ip_sum, tp_sum, B_TRUE);
		ret = ILB_BALANCED;
		break;
	case ILB_TOPO_IMPL_DSR:
		/*
		 * Direct server return: no NAT state and no conn entry, so
		 * the sticky reference is dropped here.
		 */
		if (s != NULL)
			ilb_sticky_refrele(s);
		ret = ILB_BALANCED;
		break;
	default:
		cmn_err(CE_PANIC, "data corruption unknown topology: %p",
		    (void *) rule);
		break;
	}
	ILB_RULE_REFRELE(rule);
	return (ret);

no_server:
	/* No usable back-end server: account the drop and bail. */
	ILB_R_KSTAT(rule, pkt_dropped);
	ILB_R_KSTAT_UPDATE(rule, bytes_dropped, pkt_len);
	ILB_RULE_REFRELE(rule);
	return (ILB_DROPPED);
}
/*
 * IPv4 entry point into the load balancer.  ICMP is diverted to the
 * dedicated handler; TCP/UDP traffic is funneled through ilb_check() using
 * v4-mapped IPv6 addresses.  On ILB_BALANCED, *lb_dst is set to the
 * selected IPv4 destination.
 */
int
ilb_check_v4(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ipha_t *ipha, int l4,
    uint8_t *tph, ipaddr_t *lb_dst)
{
	in6_addr_t src6, dst6, picked6;
	int action;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMP) {
		return (ilb_icmp_v4(ilbs, ill, mp, ipha, (icmph_t *)tph,
		    lb_dst));
	}

	/* Work in v4-mapped form so the common path is address-family blind. */
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &src6);
	IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &dst6);

	action = ilb_check(ilbs, ill, mp, &src6, &dst6, IPPROTO_IP, l4, ipha,
	    tph, ntohs(ipha->ipha_length), &picked6);
	if (action == ILB_BALANCED)
		IN6_V4MAPPED_TO_IPADDR(&picked6, *lb_dst);
	return (action);
}
/*
 * IPv6 entry point into the load balancer.  ICMPv6 goes to its dedicated
 * handler; everything else is passed straight into the common ilb_check()
 * path with the total packet length (payload plus fixed IPv6 header).
 */
int
ilb_check_v6(ilb_stack_t *ilbs, ill_t *ill, mblk_t *mp, ip6_t *ip6h, int l4,
    uint8_t *tph, in6_addr_t *lb_dst)
{
	uint32_t total_len;

	ASSERT(DB_REF(mp) == 1);

	if (l4 == IPPROTO_ICMPV6) {
		return (ilb_icmp_v6(ilbs, ill, mp, ip6h, (icmp6_t *)tph,
		    lb_dst));
	}

	total_len = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
	return (ilb_check(ilbs, ill, mp, &ip6h->ip6_src, &ip6h->ip6_dst,
	    IPPROTO_IPV6, l4, ip6h, tph, total_len, lb_dst));
}
/*
 * Report in *num_rules how many rules belong to the given zone.  The
 * global rule list is walked under ilbs_g_lock.
 */
void
ilb_get_num_rules(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_rules)
{
	ilb_rule_t *r;
	uint32_t cnt = 0;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (r = ilbs->ilbs_rule_head; r != NULL; r = r->ir_next) {
		if (r->ir_zoneid == zoneid)
			cnt++;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
	*num_rules = cnt;
}
/*
 * Report in *num_servers the number of back-end servers of the named rule.
 * Returns 0 on success, or the error from the rule lookup.
 */
int
ilb_get_num_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    uint32_t *num_servers)
{
	int err;
	ilb_rule_t *rule = ilb_find_rule(ilbs, zoneid, name, &err);

	if (rule == NULL)
		return (err);
	/* The count is maintained in the rule's kstat. */
	*num_servers = rule->ir_kstat.num_servers.value.ui64;
	ILB_RULE_REFRELE(rule);
	return (0);
}
/*
 * Copy up to *num_servers back-end server descriptions of the named rule
 * into the caller-supplied array.  On return *num_servers holds the number
 * actually filled in.  Returns 0 on success, or the rule lookup error.
 */
int
ilb_get_servers(ilb_stack_t *ilbs, zoneid_t zoneid, const char *name,
    ilb_server_info_t *servers, uint32_t *num_servers)
{
	ilb_rule_t *rule;
	ilb_server_t *srv;
	uint32_t filled;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, name, &err)) == NULL)
		return (err);

	filled = 0;
	for (srv = rule->ir_servers; srv != NULL && filled < *num_servers;
	    srv = srv->iser_next, filled++) {
		ilb_server_info_t *out = &servers[filled];

		(void) memcpy(out->name, srv->iser_name, ILB_SERVER_NAMESZ);
		out->addr = srv->iser_addr_v6;
		/* Ports are reported in network byte order. */
		out->min_port = htons(srv->iser_min_port);
		out->max_port = htons(srv->iser_max_port);
		out->flags = srv->iser_enabled ? ILB_SERVER_ENABLED : 0;
		out->err = 0;
	}
	ILB_RULE_REFRELE(rule);
	*num_servers = filled;
	return (0);
}
/*
 * Copy up to *num_names rule names of the given zone into buf, each
 * occupying ILB_RULE_NAMESZ bytes.  On return *num_names holds the number
 * of names actually copied.
 */
void
ilb_get_rulenames(ilb_stack_t *ilbs, zoneid_t zoneid, uint32_t *num_names,
    char *buf)
{
	ilb_rule_t *r;
	uint32_t copied = 0;

	if (*num_names == 0)
		return;

	mutex_enter(&ilbs->ilbs_g_lock);
	for (r = ilbs->ilbs_rule_head; r != NULL && copied < *num_names;
	    r = r->ir_next) {
		if (r->ir_zoneid != zoneid)
			continue;
		(void) memcpy(buf, r->ir_name, ILB_RULE_NAMESZ);
		buf += ILB_RULE_NAMESZ;
		copied++;
	}
	mutex_exit(&ilbs->ilbs_g_lock);
	*num_names = copied;
}
/*
 * Fill in cmd with the attributes of the rule named cmd->name in the given
 * zone.  Returns 0 on success, or the error from the rule lookup.
 */
int
ilb_rule_list(ilb_stack_t *ilbs, zoneid_t zoneid, ilb_rule_cmd_t *cmd)
{
	ilb_rule_t *rule;
	int err;

	if ((rule = ilb_find_rule(ilbs, zoneid, cmd->name, &err)) == NULL)
		return (err);

	/* Basic match criteria. */
	cmd->ip_ver = rule->ir_ipver;
	cmd->proto = rule->ir_proto;
	cmd->vip = rule->ir_target_v6;
	/* Ports are reported in network byte order. */
	cmd->min_port = htons(rule->ir_min_port);
	cmd->max_port = htons(rule->ir_max_port);

	/* Balancing behavior. */
	cmd->algo = rule->ir_alg_type;
	cmd->topo = rule->ir_topo;
	cmd->nat_src_start = rule->ir_nat_src_start;
	cmd->nat_src_end = rule->ir_nat_src_end;

	/* Timeouts. */
	cmd->conn_drain_timeout = rule->ir_conn_drain_timeout;
	cmd->nat_expiry = rule->ir_nat_expiry;
	cmd->sticky_expiry = rule->ir_sticky_expiry;

	/* Only the externally visible flag bits are reported. */
	cmd->flags = 0;
	if (rule->ir_flags & ILB_RULE_ENABLED)
		cmd->flags |= ILB_RULE_ENABLED;
	if (rule->ir_flags & ILB_RULE_STICKY) {
		cmd->flags |= ILB_RULE_STICKY;
		cmd->sticky_mask = rule->ir_sticky_mask;
	}

	ILB_RULE_REFRELE(rule);
	return (0);
}
/*
 * Netstack constructor for ILB.  Allocates and initializes the per-stack
 * state: locks, condition variables, hash-table parameters (copied from the
 * global tunables), the global kstats, and the rule taskq.  The various
 * hash tables themselves are created lazily elsewhere.
 *
 * Returns the new ilb_stack_t, or NULL if the global kstat setup fails.
 */
static void *
ilb_stack_init(netstackid_t stackid, netstack_t *ns)
{
	ilb_stack_t *ilbs;
	char tq_name[TASKQ_NAMELEN];

	ilbs = kmem_alloc(sizeof (ilb_stack_t), KM_SLEEP);
	ilbs->ilbs_netstack = ns;

	ilbs->ilbs_rule_head = NULL;
	ilbs->ilbs_g_hash = NULL;
	mutex_init(&ilbs->ilbs_g_lock, NULL, MUTEX_DEFAULT, NULL);

	ilbs->ilbs_kstat = kmem_alloc(sizeof (ilb_g_kstat_t), KM_SLEEP);
	if ((ilbs->ilbs_ksp = ilb_kstat_g_init(stackid, ilbs)) == NULL) {
		/*
		 * Undo everything done so far.  The original code leaked
		 * ilbs_kstat and left ilbs_g_lock initialized here.
		 */
		kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
		mutex_destroy(&ilbs->ilbs_g_lock);
		kmem_free(ilbs, sizeof (ilb_stack_t));
		return (NULL);
	}

	/*
	 * ilbs_conn_taskq is created when a rule is added, since the hash
	 * tables (whose sizes the taskq thread count depends on) can be
	 * changed before any rule exists.
	 */
	ilbs->ilbs_conn_taskq = NULL;

	/* Hash sizes are snapshots of the global tunables. */
	ilbs->ilbs_rule_hash_size = ilb_rule_hash_size;
	ilbs->ilbs_conn_hash_size = ilb_conn_hash_size;
	ilbs->ilbs_c2s_conn_hash = NULL;
	ilbs->ilbs_s2c_conn_hash = NULL;
	ilbs->ilbs_conn_timer_list = NULL;

	ilbs->ilbs_sticky_hash = NULL;
	ilbs->ilbs_sticky_hash_size = ilb_sticky_hash_size;
	ilbs->ilbs_sticky_timer_list = NULL;
	ilbs->ilbs_sticky_taskq = NULL;

	/* NAT source address table, also created lazily. */
	ilbs->ilbs_nat_src = NULL;
	ilbs->ilbs_nat_src_hash_size = ilb_nat_src_hash_size;
	mutex_init(&ilbs->ilbs_nat_src_lock, NULL, MUTEX_DEFAULT, NULL);
	ilbs->ilbs_nat_src_tid = 0;

	/* State for walking the conn list (e.g. for reporting). */
	mutex_init(&ilbs->ilbs_conn_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_conn_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_conn_list_busy = B_FALSE;
	ilbs->ilbs_conn_list_cur = 0;
	ilbs->ilbs_conn_list_connp = NULL;

	/* State for walking the sticky table. */
	mutex_init(&ilbs->ilbs_sticky_list_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&ilbs->ilbs_sticky_list_cv, NULL, CV_DEFAULT, NULL);
	ilbs->ilbs_sticky_list_busy = B_FALSE;
	ilbs->ilbs_sticky_list_cur = 0;
	ilbs->ilbs_sticky_list_curp = NULL;

	(void) snprintf(tq_name, sizeof (tq_name), "ilb_rule_taskq_%p",
	    (void *)ns);
	ilbs->ilbs_rule_taskq = taskq_create(tq_name, ILB_RULE_TASKQ_NUM_THR,
	    minclsyspri, 1, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);

	return (ilbs);
}
/*
 * Netstack shutdown callback: tear down all dynamic ILB state for this
 * stack -- the sticky and conn hash tables, every rule, and the NAT source
 * table.  The ilbs structure itself (locks, kstats, taskq) is released
 * later in ilb_stack_fini().
 */
static void
ilb_stack_shutdown(netstackid_t stackid, void *arg)
{
	ilb_stack_t *ilbs = (ilb_stack_t *)arg;
	ilb_rule_t *tmp_rule;

	ilb_sticky_hash_fini(ilbs);
	ilb_conn_hash_fini(ilbs);

	/*
	 * Unlink each rule from the hash and global list under
	 * ilbs_g_lock, but drop the lock around the actual teardown:
	 * the common deletion path may block (it quiesces users of the
	 * rule), so it must not run with the global lock held.  The loop
	 * re-reads ilbs_rule_head after reacquiring the lock, which is
	 * safe because the rule was already unlinked.
	 */
	mutex_enter(&ilbs->ilbs_g_lock);
	while ((tmp_rule = ilbs->ilbs_rule_head) != NULL) {
		ilb_rule_hash_del(tmp_rule);
		ilb_rule_g_del(ilbs, tmp_rule);
		mutex_exit(&ilbs->ilbs_g_lock);
		/*
		 * NOTE(review): the forward declaration in this file names
		 * this helper ilb_del_rule_common() -- confirm which
		 * spelling matches the actual definition.
		 */
		ilb_rule_del_common(ilbs, tmp_rule);
		mutex_enter(&ilbs->ilbs_g_lock);
	}
	mutex_exit(&ilbs->ilbs_g_lock);

	if (ilbs->ilbs_nat_src != NULL)
		ilb_nat_src_fini(ilbs);
}
/*
 * Netstack destructor: free what ilb_stack_init() created.  Runs after
 * ilb_stack_shutdown() has already torn down rules and hash tables.
 */
static void
ilb_stack_fini(netstackid_t stackid, void *arg)
{
	ilb_stack_t *ilbs = arg;

	ilb_rule_hash_fini(ilbs);
	taskq_destroy(ilbs->ilbs_rule_taskq);
	ilb_kstat_g_fini(stackid, ilbs);
	kmem_free(ilbs->ilbs_kstat, sizeof (ilb_g_kstat_t));
	kmem_free(ilbs, sizeof (ilb_stack_t));
}
/*
 * Module-load hook: register the per-netstack init/shutdown/fini callbacks
 * so each IP stack instance gets its own ILB state.
 */
void
ilb_ddi_g_init(void)
{
	netstack_register(NS_ILB,
	    ilb_stack_init, ilb_stack_shutdown, ilb_stack_fini);
}
/*
 * Module-unload hook: unregister from the netstack framework, then destroy
 * the global conn and sticky kmem caches.
 */
void
ilb_ddi_g_destroy(void)
{
	netstack_unregister(NS_ILB);

	ilb_conn_cache_fini();
	ilb_sticky_cache_fini();
}