root/usr/src/cmd/cmd-inet/usr.lib/in.mpathd/mpd_tables.h
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _MPD_TABLES_H
#define _MPD_TABLES_H

#ifdef  __cplusplus
extern "C" {
#endif

/*
 * Terminology:
 *
 * phyint: A NIC eg. hme0. This is represented as 'struct phyint'
 *
 * phyint instance: A protocol instance of a phyint. Eg. the IPv4 instance of
 *      hme0 or the IPv6 instance of hme0. (struct phyint_instance)
 *
 * logint: A logical interface eg. hme0:1 (struct logint)
 *
 * phyint_group: A group of phyints i.e. physical interfaces that are
 *      (i) connected to the same level 2 topology e.g. the same ethernet
 *          switch AND
 *      (ii) share the same phyint group name.
 * Load spreading and failover occur across members of the same phyint group.
 * phyint group members must be homogeneous. i.e. if a phyint belonging to a
 * phyint group has a IPv6 protocol instance, then all members of the phyint
 * group, must have IPv6 protocol instances. (struct phyint_group)
 */

#define MAXDEFERREDRTT          1       /* Maximum number of deferred rtts */

/*
 * Status of the phyint, expressed by the return code of failure_state()
 */
#define PHYINT_OK       0               /* No failure detected */
#define PHYINT_FAILURE  1               /* NIC failure detected */
#define GROUP_FAILURE   2               /* All NICs have failed */

/*
 * Return values of phyint_inst_update_from_k()
 */
#define PI_OK                   1       /* Phyint matches in the kernel */
#define PI_DELETED              2       /* Phyint has vanished in the kernel */
#define PI_IFINDEX_CHANGED      3       /* Phyint's ifindex has changed */
#define PI_IOCTL_ERROR          4       /* Some ioctl error */
#define PI_GROUP_CHANGED        5       /* The phyint has changed group. */

#define PHYINT_FLAGS(flags)     \
        (((flags) & (IFF_STANDBY | IFF_INACTIVE | IFF_FAILED | IFF_OFFLINE | \
        IFF_RUNNING)) | (handle_link_notifications ? 0 : IFF_RUNNING))

/* A Phyint can have up to 2 instances, the IPv4 and the IPv6 instance */
#define PHYINT_INSTANCE(pi, af) \
        ((af) == AF_INET ? (pi)->pi_v4 : (pi)->pi_v6)

/*
 * A phyint instance is probe *enabled* if it has been configured with a
 * unique probe address (i.e., an IFF_NOFAILOVER address).  It is probe
 * *capable* if it is also able to send probes (i.e., has one or more
 * targets available).
 */
#define PROBE_ENABLED(pii) \
        (((pii) != NULL) && ((pii)->pii_probe_sock != -1) &&    \
        ((pii)->pii_probe_logint != NULL) &&                    \
        (((pii)->pii_probe_logint->li_dupaddr == 0)))

#define PROBE_CAPABLE(pii) \
        (PROBE_ENABLED(pii) && ((pii)->pii_ntargets != 0))

/* Subtract b from a modulo n. i.e. (a - b) mod n  */
#define MOD_SUB(a, b, n)        \
        ((((a) + (n)) - (b)) % (n))

/* Increment modulo n */
#define MOD_INCR(a, n)          \
        (((a) + 1) % (n))

/* Decrement modulo n */
#define MOD_DCR(a, n)           \
        MOD_SUB(a, 1, n)

/*
 * 'index' represents an index into the circular probe stats array of
 * size PROBE_STATS_COUNT.  0 <= index < PROBE_STATS_COUNT. This is used
 * to access members of the pii_probes[] array defined in the phyint_instance
 * structure.
 */
#define PROBE_INDEX_PREV(index) \
        MOD_DCR(index, PROBE_STATS_COUNT)

#define PROBE_INDEX_NEXT(index) \
        MOD_INCR(index, PROBE_STATS_COUNT)


/*
 * If we receive more than LINK_UP_PERMIN "link up" notifications in a minute,
 * then don't actually perform the repair operation until we've dropped back
 * below the threshold (or we have a probe address and our probes indicate
 * that the link is functioning again).  This is to prevent link flapping in
 * the case where we don't have a probe address.
 */
#define LINK_UP_PERMIN  2

#define LINK_DOWN(pi) ((pi)->pi_link_state == 0)
#define LINK_UP(pi) (!LINK_DOWN(pi))
#define FLAGS_TO_LINK_STATE(pi) (((pi)->pi_flags & IFF_RUNNING) != 0)
#define UPDATE_LINK_STATE(pi) ((pi)->pi_link_state = \
        FLAGS_TO_LINK_STATE(pi) ? 1 : 0)
#define INIT_LINK_STATE(pi) ((pi)->pi_link_state = 1)

/*
 * Phyint group states; see below for the phyint group definition.
 */
enum pg_state {
        PG_OK = 1,      /* all interfaces in the group are working */
        PG_DEGRADED,    /* some interfaces in the group are unusable */
        PG_FAILED       /* all interfaces in the group are unusable */
};

/*
 * Convenience macro to check if the whole group has failed.
 */
#define GROUP_FAILED(pg)        ((pg)->pg_state == PG_FAILED)

/*
 * A doubly linked list of all phyint groups in the system.
 * A phyint group is identified by its group name.
 */
struct phyint_group {
        char pg_name[LIFGRNAMSIZ];      /* Phyint group name */
        struct phyint *pg_phyint;       /* List of phyints in this group */
        struct phyint_group *pg_next;   /* Next phyint group */
        struct phyint_group *pg_prev;   /* Prev phyint group */
        uint64_t        pg_sig;         /* Current signature of this group */
        int             pg_probeint;    /* Interval between probes */
        int             pg_fdt;         /* Time needed to detect failure */
        enum pg_state   pg_state;       /* Current group state */
        boolean_t       pg_in_use;      /* To detect removed groups */
        struct addrlist *pg_addrs;      /* Data addresses in this group */
        boolean_t pg_failmsg_printed;   /* Group failure msg printed */
};

/*
 * Phyint states; see below for the phyint definition.
 */
enum pi_state {
        PI_INIT         = 0,    /* Phyint is being initialized */
        PI_NOTARGETS    = 1,    /* Phyint has no targets */
        PI_RUNNING      = 2,    /* Phyint is functioning */
        PI_FAILED       = 3,    /* Phyint is failed */
        PI_OFFLINE      = 4     /* Phyint is offline */
};

/*
 * Representation of a NIC or a phyint. There is a list of all known phyints.
 * There is also a list of phyints belonging to a phyint group, one list
 * per phyint group.
 */
struct phyint {
        char    pi_name[LIFNAMSIZ + 1]; /* Phyint name eg. le0 */
        struct phyint_instance *pi_v4;  /* The IPv4 instance */
        struct phyint_instance *pi_v6;  /* The IPv6 instance */
        struct phyint_group *pi_group;  /* Pointer to the group */
        struct phyint   *pi_next;       /* List of all phyints */
        struct phyint   *pi_prev;       /* List of all phyints */
        struct phyint   *pi_pgnext;     /* List of phyints in this group */
        struct phyint   *pi_pgprev;     /* List of phyints in this group */
        uint_t          pi_ifindex;     /* interface index */
        enum pi_state   pi_state;       /* State of the phyint */
        uint64_t        pi_flags;       /* Phyint flags from kernel */
        uint16_t        pi_icmpid;      /* icmp id in icmp echo request */
        uint64_t        pi_taddrthresh; /* time (in secs) to delay logging */
                                        /* about missing test addresses */
        dlpi_handle_t   pi_dh;          /* DLPI handle to underlying link */
        uint_t          pi_notes;       /* enabled DLPI notifications */
        uchar_t         pi_hwaddr[DLPI_PHYSADDR_MAX]; /* phyint's hw address */
        size_t          pi_hwaddrlen;   /* phyint's hw address length */

        /*
         * The pi_whenup array is a circular buffer of the most recent
         * times (in milliseconds since some arbitrary point of time in
         * the past) that the interface was brought up; pi_whendx identifies
         * the oldest element of the array.
         */
        uint_t          pi_whenup[LINK_UP_PERMIN];
        unsigned int    pi_whendx;

        uint_t
                pi_taddrmsg_printed : 1,        /* testaddr msg printed */
                pi_duptaddrmsg_printed : 1,     /* dup testaddr msg printed */
                pi_cfgmsg_printed : 1,  /* bad config msg printed */
                pi_lfmsg_printed : 1,   /* link-flapping msg printed */
                pi_link_state : 1,      /* interface link state */
                pi_hwaddrdup : 1;       /* disabled due to dup hw address */
};

/*
 * A doubly linked list of all phyint_instances each of which contains a
 * doubly linked list of logical interfaces and targets. For eg. if both
 * IPv4 and IPv6 are used over hme0, we have 2 phyint instances, 1 for each
 * protocol.
 */
struct phyint_instance {
        struct phyint_instance  *pii_next;      /* List of all phyint insts */
        struct phyint_instance  *pii_prev;      /* List of all phyint insts */

        struct phyint   *pii_phyint;    /* Back pointer to the phyint */
        struct target   *pii_targets;   /* List of targets on this link */
        struct logint   *pii_probe_logint; /* IFF_NOFAILOVER addr for probing */
        struct logint   *pii_logint;    /* Doubly linked list of logical ifs */

        int     pii_probe_sock;         /* Socket for ICMP Probe packets */
        int     pii_af;                 /* Address family */
        uint16_t pii_rack;              /* highest acknowledged seq number */
        uint16_t pii_snxt;              /* sequence number of next probe */
        uint_t  pii_snxt_time;          /* actual next probe time that */
                                        /* includes some randomness */

        uint_t  pii_snxt_basetime;      /* strictly periodic base probe time */
                                        /* for all periodic probes */
        uint_t  pii_fd_snxt_basetime;   /* strictly periodic base probe time */
                                        /* for failure detection probes */

        hrtime_t        pii_fd_hrtime;  /* hrtime_t before which we should */
                                        /* not send probes out this pii */

        uint64_t        pii_flags;      /* Phyint flags from kernel */

        struct probe_stats {
                uint_t          pr_id;          /* Full ID of probe */
                struct target   *pr_target;     /* Probe Target */
                uint_t          pr_time_lost;   /* Time probe declared lost */
                struct timeval  pr_tv_sent;     /* Wall time probe was sent */
                hrtime_t pr_hrtime_start;       /* hrtime probe op started */
                hrtime_t pr_hrtime_sent;        /* hrtime probe was sent */
                hrtime_t pr_hrtime_ackrecv;     /* hrtime probe ack received */
                hrtime_t pr_hrtime_ackproc;     /* hrtime probe ack processed */
                uint_t  pr_status;      /* probe status as below */
#define PR_UNUSED       0               /* Probe slot unused */
#define PR_UNACKED      1               /* Probe is unacknowledged */
#define PR_ACKED        2               /* Probe has been acknowledged */
#define PR_LOST         3               /* Probe is declared lost */
        } pii_probes[PROBE_STATS_COUNT];

        uint_t
                pii_in_use : 1,                 /* To detect removed phyints */
                pii_basetime_inited : 1,        /* probe time initialized */
                pii_targets_are_routers : 1;    /* routers or hosts ? */

        uint_t  pii_probe_next;         /* next index to use in pii_probes[] */
        struct target *pii_target_next; /* next target for probing */
        struct target *pii_rtt_target_next;
                                        /* next target for rtt probes */

        int     pii_ntargets;           /* Number of active targets */
        struct stats {                  /* Cumulative statistics */
                uint64_t        lost;           /* Number of probes lost */
                uint64_t        acked;          /* Number of probes acked */
                uint64_t        sent;           /* Number of probes sent */
                uint64_t        unknown;        /* Number of ambiguous */
                                                /* probe acks */
        } pii_cum_stats;
};

#define pii_name        pii_phyint->pi_name
#define pii_ifindex     pii_phyint->pi_ifindex
#define pii_state       pii_phyint->pi_state
#define pii_icmpid      pii_phyint->pi_icmpid

#define PR_STATUS_VALID(status)         ((status) <= PR_LOST)


/*
 * A doubly linked list of prefixes or logicals, hanging off the
 * phyint instance.
 */
struct logint {
        struct logint   *li_next;       /* Next logint of this phyint inst. */
        struct logint   *li_prev;       /* Prev logint of this phyint inst. */
        struct phyint_instance  *li_phyint_inst;
                                        /* Back pointer to phyint inst. */

        char            li_name[LIFNAMSIZ + 1]; /* name Eg. hme0:1 */
        struct in6_addr li_addr;        /* IP address */
        struct in6_addr li_dstaddr;     /* Dst IP address for pointopoint */
        struct in6_addr li_subnet;      /* prefix / subnet */
        uint_t          li_subnet_len;  /* prefix / subnet length */
        uint64_t        li_flags;       /* IFF_* flags */
        uint_t
                        li_in_use : 1,  /* flag to detect deleted logints */
                        li_dupaddr : 1; /* test address is not unique */
};


/*
 * Doubly-linked list of probe targets on a phyint instance. Probe targets are
 * usually onlink routers. If no onlink routers can be found, onlink hosts
 * are used.
 */
struct target {
        struct target   *tg_next;       /* Next target for this phyint inst. */
        struct target   *tg_prev;       /* Prev target for this phyint inst. */
        struct phyint_instance  *tg_phyint_inst;
                                        /* Back pointer to phyint instance */

        struct in6_addr tg_address;     /* Target IP address */
        int             tg_status;      /* Status of the target below */
#define TG_ACTIVE       1               /* active probe target */
#define TG_UNUSED       2               /* target not in use now */
#define TG_SLOW         3               /* rtt is high - Not in use now */
#define TG_DEAD         4               /* Target is not responding */

        hrtime_t        tg_latime;      /* Target's last active time */
        int64_t         tg_rtt_sa;      /* Scaled RTT average (in ns) */
        int64_t         tg_rtt_sd;      /* Scaled RTT deviation (in ns) */
        int             tg_crtt;        /* Conservative RTT = A + 4D (in ms) */
        uint32_t
                        tg_in_use : 1;  /* In use flag */
        int64_t         tg_deferred[MAXDEFERREDRTT + 1];
                                        /* Deferred rtt data points */
        int             tg_num_deferred;
                                        /* Number of deferred rtt data points */
};

#define TG_STATUS_VALID(status) \
        (((status) >= TG_ACTIVE) && ((status) <= TG_DEAD))

/*
 * Statistics about consecutive probe failures are passed around between
 * functions in this structure.
 */
struct probe_fail_count
{
        uint_t  pf_tff;         /* Earliest time of failure in a series */
        int     pf_nfail;       /* Number of consecutive probe failures */
        int     pf_nfail_tg;    /* Number of consecutive probe fails for */
                                /* some given target 'tg' */
};

/*
 * Statistics about consecutive probe successes is passed around between
 * functions in this structure.
 */
struct probe_success_count
{
        uint_t ps_tls;          /* Most recent time of probe success */
        boolean_t ps_tls_valid; /* is ps_tls valid */
        int     ps_nsucc;       /* Number of consecutive probe successes */
                                /* starting from the most recent */
        int     ps_nsucc_tg;    /* Number of consecutive probe successes */
                                /* for some given target 'tg' */
};

/*
 * Statistics about missed probes that were never sent.
 * Happens due to scheduling delay.
 */

struct probes_missed
{
        uint_t  pm_nprobes;     /* Cumulative number of missed probes */
        uint_t  pm_ntimes;      /* Total number of occasions */
};

typedef struct addrlist {
        struct addrlist         *al_next;               /* next address */
        char                    al_name[LIFNAMSIZ];     /* address lif name */
        uint64_t                al_flags;               /* address flags */
        struct sockaddr_storage al_addr;                /* address */
} addrlist_t;

/*
 * Globals
 */
extern addrlist_t *localaddrs;
                        /* List of all local addresses, including local zones */
extern struct phyint *phyints;          /* List of all phyints */
extern struct phyint_group *phyint_groups; /* List of all phyint groups */
extern struct phyint_group *phyint_anongroup; /* Pointer to the anon group */
extern struct phyint_instance *phyint_instances;
                                        /* List of all phyint instances */
extern struct probes_missed probes_missed;
                                        /* statistics about missed probes */

/*
 * Function prototypes
 */
extern int phyint_init(void);
extern struct phyint *phyint_lookup(const char *name);
extern struct phyint_instance *phyint_inst_lookup(int af, char *name);
extern struct phyint_instance *phyint_inst_init_from_k(int af, char *name);
extern struct phyint_instance *phyint_inst_other(struct phyint_instance *pii);
extern int phyint_inst_update_from_k(struct phyint_instance *pii);
extern void phyint_inst_delete(struct phyint_instance *pii);
extern uint_t phyint_inst_timer(struct phyint_instance *pii);
extern boolean_t phyint_inst_sockinit(struct phyint_instance *pii);

extern void phyint_changed(struct phyint *pi);
extern void phyint_chstate(struct phyint *pi, enum pi_state state);
extern void phyint_group_chstate(struct phyint_group *pg, enum pg_state state);
extern struct phyint_group *phyint_group_create(const char *pg_name);
extern struct phyint_group *phyint_group_lookup(const char *pg_name);
extern void phyint_group_insert(struct phyint_group *pg);
extern void phyint_group_delete(struct phyint_group *pg);
extern void phyint_group_refresh_state(struct phyint_group *pg);
extern void phyint_standby_refresh_inactive(struct phyint *pi);
extern void phyint_check_for_repair(struct phyint *pi);
extern void phyint_transition_to_running(struct phyint *pi);
extern void phyint_activate_another(struct phyint *pi);
extern int phyint_offline(struct phyint *pi, unsigned int);
extern int phyint_undo_offline(struct phyint *pi);
extern boolean_t phyint_is_functioning(struct phyint *pi);

extern void logint_init_from_k(struct phyint_instance *pii, char *li_name);
extern void logint_delete(struct logint *li);

extern struct target *target_lookup(struct phyint_instance *pii,
    struct in6_addr addr);
extern void target_create(struct phyint_instance *pii,
    struct in6_addr addr, boolean_t is_router);
extern void target_delete(struct target *tg);
extern struct target *target_next(struct target *tg);
extern void target_add(struct phyint_instance *pii, struct in6_addr addr,
    boolean_t is_router);

extern void in_data(struct phyint_instance *pii);
extern void in6_data(struct phyint_instance *pii);

extern void logperror_pii(struct phyint_instance *pii, const char *str);
extern void logperror_li(struct logint *li, const char *str);
extern char *pr_addr(int af, struct in6_addr addr, char *abuf, int len);
extern void addr2storage(int af, const struct in6_addr *addr,
    struct sockaddr_storage *ssp);
extern void phyint_inst_print_all(void);
extern boolean_t prefix_equal(struct in6_addr, struct in6_addr, uint_t);

extern void reset_crtt_all(struct phyint *pi);
extern int failure_state(struct phyint_instance *pii);
extern void process_link_state_changes(void);
extern void clear_pii_probe_stats(struct phyint_instance *pii);
extern void start_timer(struct phyint_instance *pii);
extern void stop_probing(struct phyint *pi);

extern boolean_t own_address(struct in6_addr addr);
extern boolean_t change_pif_flags(struct phyint *pi, uint64_t set,
    uint64_t clear);

extern void close_probe_socket(struct phyint_instance *pii, boolean_t flag);
extern int probe_state_event(struct probe_stats *, struct phyint_instance *);
extern void probe_chstate(struct probe_stats *, struct phyint_instance *, int);

extern unsigned int getgraddrinfo(const char *, struct sockaddr_storage *,
    ipmp_addrinfo_t **);
extern unsigned int getifinfo(const char *, ipmp_ifinfo_t **);
extern unsigned int getgroupinfo(const char *, ipmp_groupinfo_t **);
extern unsigned int getgrouplist(ipmp_grouplist_t **);
extern unsigned int getsnap(ipmp_snap_t **);

extern boolean_t addrlist_add(addrlist_t **, const char *, uint64_t,
    struct sockaddr_storage *);
extern void addrlist_free(addrlist_t **);

#ifdef  __cplusplus
}
#endif

#endif  /* _MPD_TABLES_H */