root/usr.sbin/ldpd/kroute.c
/*      $OpenBSD: kroute.c,v 1.72 2025/12/04 14:13:51 claudio Exp $ */

/*
 * Copyright (c) 2015, 2016 Renato Westphal <renato@openbsd.org>
 * Copyright (c) 2009 Michele Marchetto <michele@openbsd.org>
 * Copyright (c) 2004 Esben Norby <norby@openbsd.org>
 * Copyright (c) 2003, 2004 Henning Brauer <henning@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/sysctl.h>
#include <arpa/inet.h>
#include <net/if_dl.h>
#include <net/if_types.h>
#include <net/route.h>
#include <netmpls/mpls.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <limits.h>

#include "ldpd.h"
#include "log.h"

/*
 * File-wide state of the kernel routing interface, filled in by kr_init()
 * and kif_init().
 */
struct {
        uint32_t                rtseq;          /* set to 1 by kr_init() */
        pid_t                   pid;            /* getpid() at kr_init() time */
        int                     fib_sync;       /* 1 = coupled, 0 = decoupled */
        int                     fd;             /* AF_ROUTE socket */
        int                     ioctl_fd;       /* AF_INET datagram socket */
        struct event            ev;             /* read event on fd */
        unsigned int            rdomain;        /* rdomain given to kr_init() */
} kr_state;

/*
 * One nexthop of a prefix at a given priority.  The embedded kroute
 * carries the labels and flags manipulated by the kr_*() functions.
 */
struct kroute_node {
        TAILQ_ENTRY(kroute_node)         entry;
        struct kroute_priority          *kprio;         /* back pointer */
        struct kroute                    r;
};

/*
 * All nexthops of a prefix that share one route priority.  kroute_insert()
 * keeps the per-prefix list of these sorted by ascending priority value.
 */
struct kroute_priority {
        TAILQ_ENTRY(kroute_priority)     entry;
        struct kroute_prefix            *kp;            /* back pointer */
        uint8_t                          priority;
        TAILQ_HEAD(, kroute_node)        nexthops;
};

/*
 * Node of the route tree, keyed on (af, prefix, prefixlen) as implemented
 * by kroute_compare().  Each prefix owns a list of priorities.
 */
struct kroute_prefix {
        RB_ENTRY(kroute_prefix)          entry;
        int                              af;
        union ldpd_addr                  prefix;
        uint8_t                          prefixlen;
        TAILQ_HEAD(plist, kroute_priority) priorities;
};
RB_HEAD(kroute_tree, kroute_prefix);
RB_PROTOTYPE(kroute_tree, kroute_prefix, entry, kroute_compare)

/* One address configured on an interface; linked from kif_node.addrs. */
struct kif_addr {
        TAILQ_ENTRY(kif_addr)    entry;
        struct kaddr             a;
};

/*
 * Node of the interface tree, keyed on k.ifindex (see kif_compare()).
 * kpw, when non-NULL, is handed to kmpw_install() on FIB couple.
 */
struct kif_node {
        RB_ENTRY(kif_node)       entry;
        TAILQ_HEAD(, kif_addr)   addrs;
        struct kif               k;
        struct kpw              *kpw;
};
RB_HEAD(kif_tree, kif_node);
RB_PROTOTYPE(kif_tree, kif_node, entry, kif_compare)

/* Prototypes of the file-local helpers. */
static void              kr_dispatch_msg(int, short, void *);
static void              kr_redist_remove(struct kroute *);
static int               kr_redist_eval(struct kroute *);
static void              kr_redistribute(struct kroute_prefix *);
static __inline int      kroute_compare(struct kroute_prefix *,
                            struct kroute_prefix *);
static struct kroute_prefix     *kroute_find_prefix(int, union ldpd_addr *,
                            uint8_t);
static struct kroute_priority   *kroute_find_prio(struct kroute_prefix *,
                            uint8_t);
static struct kroute_node       *kroute_find_gw(struct kroute_priority *,
                                    union ldpd_addr *);
static int               kroute_insert(struct kroute *);
static int               kroute_uninstall(struct kroute_node *);
static int               kroute_remove(struct kroute *);
static void              kroute_clear(void);
static __inline int      kif_compare(struct kif_node *, struct kif_node *);
static struct kif_node  *kif_find(unsigned short);
static struct kif_node  *kif_insert(unsigned short);
static int               kif_remove(struct kif_node *, int);
static struct kif_node  *kif_update(unsigned short, int, struct if_data *,
                            struct sockaddr_dl *, int *);
static struct kroute_priority   *kroute_match(int, union ldpd_addr *);
static uint8_t           prefixlen_classful(in_addr_t);
static void              get_rtaddrs(int, struct sockaddr *,
                            struct sockaddr **);
static void              if_change(unsigned short, int, struct if_data *,
                           struct sockaddr_dl *);
static void              if_newaddr(unsigned short, struct sockaddr *,
                            struct sockaddr *, struct sockaddr *);
static void              if_deladdr(unsigned short, struct sockaddr *,
                            struct sockaddr *, struct sockaddr *);
static void              if_announce(void *);
static int               send_rtmsg(int, int, struct kroute *, int);
static int               send_rtmsg_v4(int fd, int, struct kroute *, int);
static int               send_rtmsg_v6(int fd, int, struct kroute *, int);
static int               fetchtable(void);
static int               fetchifs(void);
static int               dispatch_rtmsg(void);
static int               rtmsg_process(char *, size_t);
static int               rtmsg_process_route(struct rt_msghdr *,
                            struct sockaddr *[RTAX_MAX]);
static int               kmpw_install(const char *, struct kpw *);
static int               kmpw_uninstall(const char *);

/* Emit the red-black tree code for the route and interface trees. */
RB_GENERATE(kroute_tree, kroute_prefix, entry, kroute_compare)
RB_GENERATE(kif_tree, kif_node, entry, kif_compare)

/* Roots of the route tree (krt) and the interface tree (kit). */
static struct kroute_tree        krt = RB_INITIALIZER(&krt);
static struct kif_tree           kit = RB_INITIALIZER(&kit);

/*
 * Load the interface list and open the socket stored in kr_state.ioctl_fd.
 *
 * Returns 0 on success, -1 if fetchifs() fails or the socket cannot be
 * created (the latter is logged).
 */
int
kif_init(void)
{
        int     s;

        if (fetchifs() == -1)
                return (-1);

        s = socket(AF_INET, SOCK_DGRAM | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
        kr_state.ioctl_fd = s;
        if (s == -1) {
                log_warn("%s: ioctl socket", __func__);
                return (-1);
        }

        return (0);
}

/*
 * Open and configure the routing socket, load the kernel routing table and
 * register the socket with the event loop.
 *
 * fs is the initial FIB synchronisation state and rdomain is recorded in
 * kr_state.  Returns 0 on success, -1 if the socket cannot be created or
 * fetchtable() fails.  Failing socket options are logged and otherwise
 * ignored.
 */
int
kr_init(int fs, unsigned int rdomain)
{
        unsigned int    rtfilter;
        socklen_t       optlen;
        int             default_rcvbuf, rcvbuf;
        int             opt = 0;

        kr_state.fib_sync = fs;
        kr_state.rdomain = rdomain;

        kr_state.fd = socket(AF_ROUTE,
            SOCK_RAW | SOCK_CLOEXEC | SOCK_NONBLOCK, 0);
        if (kr_state.fd == -1) {
                log_warn("%s: socket", __func__);
                return (-1);
        }

        /* not interested in my own messages */
        if (setsockopt(kr_state.fd, SOL_SOCKET, SO_USELOOPBACK,
            &opt, sizeof(opt)) == -1)
                log_warn("%s: setsockopt(SO_USELOOPBACK)", __func__);

        /* only let through the message types this file handles */
        rtfilter = ROUTE_FILTER(RTM_ADD);
        rtfilter |= ROUTE_FILTER(RTM_GET);
        rtfilter |= ROUTE_FILTER(RTM_CHANGE);
        rtfilter |= ROUTE_FILTER(RTM_DELETE);
        rtfilter |= ROUTE_FILTER(RTM_IFINFO);
        rtfilter |= ROUTE_FILTER(RTM_NEWADDR);
        rtfilter |= ROUTE_FILTER(RTM_DELADDR);
        rtfilter |= ROUTE_FILTER(RTM_IFANNOUNCE);

        if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_MSGFILTER,
            &rtfilter, sizeof(rtfilter)) == -1)
                log_warn("%s: setsockopt(ROUTE_MSGFILTER)", __func__);

        /*
         * Grow the receive buffer so that messages are not lost: start at
         * MAX_RTSOCK_BUF and halve the request for as long as the kernel
         * answers ENOBUFS and the request still exceeds the default size.
         */
        optlen = sizeof(default_rcvbuf);
        if (getsockopt(kr_state.fd, SOL_SOCKET, SO_RCVBUF,
            &default_rcvbuf, &optlen) == -1) {
                log_warn("%s: getsockopt SOL_SOCKET SO_RCVBUF", __func__);
        } else {
                rcvbuf = MAX_RTSOCK_BUF;
                while (rcvbuf > default_rcvbuf) {
                        if (setsockopt(kr_state.fd, SOL_SOCKET, SO_RCVBUF,
                            &rcvbuf, sizeof(rcvbuf)) != -1)
                                break;
                        if (errno != ENOBUFS)
                                break;
                        rcvbuf /= 2;
                }
        }

        kr_state.pid = getpid();
        kr_state.rtseq = 1;

        if (fetchtable() == -1)
                return (-1);

        event_set(&kr_state.ev, kr_state.fd, EV_READ | EV_PERSIST,
            kr_dispatch_msg, NULL);
        event_add(&kr_state.ev, NULL);

        return (0);
}

/*
 * Send an IMSG_NEWADDR to ldpe for every known address of the interfaces
 * in our rdomain.  When ifname is non-NULL only the interface with that
 * name is considered.
 */
void
kif_redistribute(const char *ifname)
{
        struct kif_node         *kif;
        struct kif_addr         *ka;

        RB_FOREACH(kif, kif_tree, &kit) {
                if (kif->k.rdomain == kr_state.rdomain &&
                    (ifname == NULL ||
                    strcmp(kif->k.ifname, ifname) == 0)) {
                        TAILQ_FOREACH(ka, &kif->addrs, entry) {
                                main_imsg_compose_ldpe(IMSG_NEWADDR, 0,
                                    &ka->a, sizeof(ka->a));
                        }
                }
        }
}

/*
 * Install or update the label binding of an existing route.
 *
 * The route is located by prefix, priority and nexthop.  Its labels are
 * taken from kr and it is marked F_LDPD_INSERTED.  An AF_MPLS message is
 * sent (RTM_ADD the first time, RTM_CHANGE afterwards), followed by an
 * RTM_CHANGE for the route's own address family when it has a nexthop and
 * a remote label.
 *
 * Returns 0 on success, -1 if the route is unknown or a message fails.
 */
int
kr_change(struct kroute *kr)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        struct kroute_node      *kn = NULL;
        struct kroute           *r;
        int                      action;

        kp = kroute_find_prefix(kr->af, &kr->prefix, kr->prefixlen);
        if (kp != NULL) {
                kprio = kroute_find_prio(kp, kr->priority);
                if (kprio != NULL)
                        kn = kroute_find_gw(kprio, &kr->nexthop);
        }
        if (kn == NULL) {
                log_warnx("%s: lost FEC %s/%d nexthop %s", __func__,
                    log_addr(kr->af, &kr->prefix), kr->prefixlen,
                    log_addr(kr->af, &kr->nexthop));
                return (-1);
        }
        r = &kn->r;

        /* an LSP that is already installed only needs changing */
        action = (r->flags & F_LDPD_INSERTED) ? RTM_CHANGE : RTM_ADD;

        r->local_label = kr->local_label;
        r->remote_label = kr->remote_label;
        r->flags |= F_LDPD_INSERTED;

        /* send update */
        if (send_rtmsg(kr_state.fd, action, r, AF_MPLS) == -1)
                return (-1);

        if (!ldp_addrisset(r->af, &r->nexthop) ||
            r->remote_label == NO_LABEL)
                return (0);

        if (send_rtmsg(kr_state.fd, RTM_CHANGE, r, r->af) == -1)
                return (-1);

        return (0);
}

/*
 * Remove the label binding of a route.
 *
 * Unknown routes and routes without F_LDPD_INSERTED are silently accepted.
 * Otherwise the AF_MPLS entry is deleted, the labels are reset to NO_LABEL
 * and, if the route had a nexthop and a remote label, an RTM_CHANGE is sent
 * for the route's own address family.
 *
 * Returns 0 on success or when there was nothing to do, -1 if a routing
 * message fails.
 */
int
kr_delete(struct kroute *kr)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        struct kroute_node      *kn = NULL;
        struct kroute           *r;
        int                      update;

        kp = kroute_find_prefix(kr->af, &kr->prefix, kr->prefixlen);
        if (kp != NULL) {
                kprio = kroute_find_prio(kp, kr->priority);
                if (kprio != NULL)
                        kn = kroute_find_gw(kprio, &kr->nexthop);
        }
        if (kn == NULL)
                return (0);
        r = &kn->r;

        if ((r->flags & F_LDPD_INSERTED) == 0)
                return (0);

        /* must be decided before the labels are wiped below */
        update = ldp_addrisset(r->af, &r->nexthop) &&
            r->remote_label != NO_LABEL;

        /* kill MPLS LSP */
        if (send_rtmsg(kr_state.fd, RTM_DELETE, r, AF_MPLS) == -1)
                return (-1);

        r->flags &= ~F_LDPD_INSERTED;
        r->local_label = NO_LABEL;
        r->remote_label = NO_LABEL;

        if (!update)
                return (0);

        if (send_rtmsg(kr_state.fd, RTM_CHANGE, r, r->af) == -1)
                return (-1);

        return (0);
}

/*
 * Shut the kernel interface down: decouple from the FIB first, then drop
 * the route table and the interface table.
 */
void
kr_shutdown(void)
{
        kr_fib_decouple();
        kroute_clear();
        kif_clear();
}

/*
 * Couple with the kernel routing table.
 *
 * Does nothing when already coupled.  Otherwise fib_sync is set first and
 * then, for every prefix, the routes of its first (lowest) priority that
 * are marked F_LDPD_INSERTED are announced: an RTM_ADD for AF_MPLS plus,
 * when the route has a nexthop and a remote label, an RTM_CHANGE for its
 * own address family.  Finally every interface with a kpw attached is
 * handed to kmpw_install().  Results of the individual messages are not
 * checked.
 */
void
kr_fib_couple(void)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        struct kroute_node      *kn;
        struct kroute           *r;
        struct kif_node         *kif;

        if (kr_state.fib_sync == 1)     /* already coupled */
                return;

        kr_state.fib_sync = 1;

        RB_FOREACH(kp, kroute_tree, &krt) {
                if ((kprio = TAILQ_FIRST(&kp->priorities)) == NULL)
                        continue;

                TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
                        r = &kn->r;
                        if ((r->flags & F_LDPD_INSERTED) == 0)
                                continue;

                        send_rtmsg(kr_state.fd, RTM_ADD, r, AF_MPLS);

                        if (!ldp_addrisset(r->af, &r->nexthop) ||
                            r->remote_label == NO_LABEL)
                                continue;

                        send_rtmsg(kr_state.fd, RTM_CHANGE, r, r->af);
                }
        }

        RB_FOREACH(kif, kif_tree, &kit) {
                if (kif->kpw)
                        kmpw_install(kif->k.ifname, kif->kpw);
        }

        log_info("kernel routing table coupled");
}

/*
 * Decouple from the kernel routing table.
 *
 * Does nothing when already decoupled.  Otherwise, for every prefix, the
 * F_LDPD_INSERTED routes of its first (lowest) priority get their AF_MPLS
 * entry deleted.  Routes with a nexthop and a remote label are also sent
 * as RTM_CHANGE with the label temporarily set to NO_LABEL; the in-memory
 * label is restored afterwards.  Interfaces with a kpw are passed to
 * kmpw_uninstall().  fib_sync is cleared only after all messages went out.
 */
void
kr_fib_decouple(void)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        struct kroute_node      *kn;
        struct kroute           *r;
        struct kif_node         *kif;
        uint32_t                 saved_label;

        if (kr_state.fib_sync == 0)     /* already decoupled */
                return;

        RB_FOREACH(kp, kroute_tree, &krt) {
                if ((kprio = TAILQ_FIRST(&kp->priorities)) == NULL)
                        continue;

                TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
                        r = &kn->r;
                        if ((r->flags & F_LDPD_INSERTED) == 0)
                                continue;

                        send_rtmsg(kr_state.fd, RTM_DELETE, r, AF_MPLS);

                        if (!ldp_addrisset(r->af, &r->nexthop) ||
                            r->remote_label == NO_LABEL)
                                continue;

                        /* send the route without its label, then put it back */
                        saved_label = r->remote_label;
                        r->remote_label = NO_LABEL;
                        send_rtmsg(kr_state.fd, RTM_CHANGE, r, r->af);
                        r->remote_label = saved_label;
                }
        }

        RB_FOREACH(kif, kif_tree, &kit) {
                if (kif->kpw)
                        kmpw_uninstall(kif->k.ifname);
        }

        kr_state.fib_sync = 0;
        log_info("kernel routing table decoupled");
}

/*
 * Rewrite the local label of every route of address family af whose local
 * label is not above MPLS_LABEL_RESERVED_MAX.
 *
 * With was_implicit == 0 the label becomes MPLS_LABEL_IMPLNULL; otherwise
 * it becomes the IPv4 or IPv6 explicit-null label matching the route's
 * address family.  Only the in-memory table is touched here.
 */
void
kr_change_egress_label(int af, int was_implicit)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        struct kroute_node      *kn;
        struct kroute           *r;

        RB_FOREACH(kp, kroute_tree, &krt) {
                if (kp->af != af)
                        continue;

                TAILQ_FOREACH(kprio, &kp->priorities, entry) {
                        TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
                                r = &kn->r;
                                if (r->local_label > MPLS_LABEL_RESERVED_MAX)
                                        continue;

                                if (!was_implicit)
                                        r->local_label = MPLS_LABEL_IMPLNULL;
                                else if (r->af == AF_INET)
                                        r->local_label = MPLS_LABEL_IPV4NULL;
                                else if (r->af == AF_INET6)
                                        r->local_label = MPLS_LABEL_IPV6NULL;
                        }
                }
        }
}

/*
 * Event callback registered by kr_init() for the routing socket.  None of
 * the arguments are used; the event loop is left when dispatch_rtmsg()
 * reports an error.
 */
static void
kr_dispatch_msg(int fd, short event, void *bula)
{
        if (dispatch_rtmsg() == -1)
                event_loopexit(NULL);
}

void
kr_show_route(struct imsg *imsg)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        struct kroute_node      *kn;
        int                      flags;
        struct kroute            kr;

        switch (imsg->hdr.type) {
        case IMSG_CTL_KROUTE:
                if (imsg->hdr.len != IMSG_HEADER_SIZE + sizeof(flags)) {
                        log_warnx("%s: wrong imsg len", __func__);
                        return;
                }
                memcpy(&flags, imsg->data, sizeof(flags));

                RB_FOREACH(kp, kroute_tree, &krt)
                        TAILQ_FOREACH(kprio, &kp->priorities, entry)
                                TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
                                        if (flags && !(kn->r.flags & flags))
                                                continue;

                                        main_imsg_compose_ldpe(IMSG_CTL_KROUTE,
                                            imsg->hdr.pid, &kn->r,
                                            sizeof(kn->r));
                                }
                break;
        case IMSG_CTL_KROUTE_ADDR:
                if (imsg->hdr.len != IMSG_HEADER_SIZE + sizeof(kr)) {
                        log_warnx("%s: wrong imsg len", __func__);
                        return;
                }
                memcpy(&kr, imsg->data, sizeof(kr));

                kprio = kroute_match(kr.af, &kr.prefix);
                if (kprio == NULL)
                        break;

                TAILQ_FOREACH(kn, &kprio->nexthops, entry)
                        main_imsg_compose_ldpe(IMSG_CTL_KROUTE, imsg->hdr.pid,
                            &kn->r, sizeof(kn->r));
                break;
        default:
                log_debug("%s: error handling imsg", __func__);
                break;
        }
        main_imsg_compose_ldpe(IMSG_CTL_END, imsg->hdr.pid, NULL, 0);
}

void
kr_ifinfo(char *ifname, pid_t pid)
{
        struct kif_node *kif;

        RB_FOREACH(kif, kif_tree, &kit)
                if (ifname == NULL || !strcmp(ifname, kif->k.ifname)) {
                        main_imsg_compose_ldpe(IMSG_CTL_IFINFO,
                            pid, &kif->k, sizeof(kif->k));
                }

        main_imsg_compose_ldpe(IMSG_CTL_END, pid, NULL, 0);
}

/*
 * Withdraw a route from redistribution: if it carries F_REDISTRIBUTED the
 * flag is cleared and lde is told with IMSG_NETWORK_DEL.  Routes that were
 * never redistributed are left alone.
 */
static void
kr_redist_remove(struct kroute *kr)
{
        if (kr->flags & F_REDISTRIBUTED) {
                /* clear the flag before the route is copied into the imsg */
                kr->flags &= ~F_REDISTRIBUTED;
                main_imsg_compose_lde(IMSG_NETWORK_DEL, 0, kr, sizeof(*kr));
        }
}

/*
 * Decide whether a route is to be redistributed and, if so, mark it
 * F_REDISTRIBUTED and announce it to lde with IMSG_NETWORK_ADD.
 *
 * Not redistributed: routes already marked, dynamic routes, prefixes
 * rejected by bad_addr() or (IPv6) with an embedded scope, the default
 * route, and routes via the loopback address unless they are blackhole or
 * reject routes.
 *
 * Returns 1 if the route was announced, 0 otherwise.
 */
static int
kr_redist_eval(struct kroute *kr)
{
        int     nh_loopback;

        /* already redistributed, or dynamic and thus not redistributable */
        if (kr->flags & (F_REDISTRIBUTED | F_DYNAMIC))
                return (0);

        /* filter-out non-redistributable addresses */
        if (bad_addr(kr->af, &kr->prefix))
                return (0);
        if (kr->af == AF_INET6 && IN6_IS_SCOPE_EMBED(&kr->prefix.v6))
                return (0);

        /* do not redistribute the default route */
        if (kr->prefixlen == 0)
                return (0);

        /*
         * Consider networks with nexthop loopback as not redistributable
         * unless it is a reject or blackhole route.
         */
        switch (kr->af) {
        case AF_INET:
                nh_loopback =
                    (kr->nexthop.v4.s_addr == htonl(INADDR_LOOPBACK));
                break;
        case AF_INET6:
                nh_loopback = IN6_IS_ADDR_LOOPBACK(&kr->nexthop.v6);
                break;
        default:
                log_debug("%s: unexpected address-family", __func__);
                nh_loopback = 0;
                break;
        }
        if (nh_loopback && !(kr->flags & (F_BLACKHOLE|F_REJECT)))
                return (0);

        /* prefix should be redistributed */
        kr->flags |= F_REDISTRIBUTED;
        main_imsg_compose_lde(IMSG_NETWORK_ADD, 0, kr, sizeof(*kr));
        return (1);
}

/*
 * Re-evaluate redistribution for a prefix.  The priorities are walked from
 * last to first: nexthops of every priority but the first are withdrawn,
 * nexthops of the first priority are passed to kr_redist_eval().
 */
static void
kr_redistribute(struct kroute_prefix *kp)
{
        struct kroute_priority  *kprio;
        struct kroute_node      *kn;
        int                      is_first;

        TAILQ_FOREACH_REVERSE(kprio, &kp->priorities, plist, entry) {
                is_first = (kprio == TAILQ_FIRST(&kp->priorities));

                TAILQ_FOREACH(kn, &kprio->nexthops, entry) {
                        if (is_first)
                                kr_redist_eval(&kn->r);
                        else
                                kr_redist_remove(&kn->r);
                }
        }
}

/*
 * rb-tree compare for the route tree: order by address family, then by
 * prefix address (ldp_addrcmp), then by prefix length.
 */
static __inline int
kroute_compare(struct kroute_prefix *a, struct kroute_prefix *b)
{
        int              addrcmp;

        if (a->af != b->af)
                return (a->af < b->af ? -1 : 1);

        addrcmp = ldp_addrcmp(a->af, &a->prefix, &b->prefix);
        if (addrcmp != 0)
                return (addrcmp);

        if (a->prefixlen != b->prefixlen)
                return (a->prefixlen < b->prefixlen ? -1 : 1);

        return (0);
}

/* tree management */
static struct kroute_prefix *
kroute_find_prefix(int af, union ldpd_addr *prefix, uint8_t prefixlen)
{
        struct kroute_prefix     s;

        s.af = af;
        s.prefix = *prefix;
        s.prefixlen = prefixlen;

        return (RB_FIND(kroute_tree, &krt, &s));
}

static struct kroute_priority *
kroute_find_prio(struct kroute_prefix *kp, uint8_t prio)
{
        struct kroute_priority  *kprio;

        /* RTP_ANY here picks the lowest priority node */
        if (prio == RTP_ANY)
                return (TAILQ_FIRST(&kp->priorities));

        TAILQ_FOREACH(kprio, &kp->priorities, entry)
                if (kprio->priority == prio)
                        return (kprio);

        return (NULL);
}

static struct kroute_node *
kroute_find_gw(struct kroute_priority *kprio, union ldpd_addr *nh)
{
        struct kroute_node      *kn;

        TAILQ_FOREACH(kn, &kprio->nexthops, entry)
                if (ldp_addrcmp(kprio->kp->af, &kn->r.nexthop, nh) == 0)
                        return (kn);

        return (NULL);
}

static int
kroute_insert(struct kroute *kr)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio, *tmp;
        struct kroute_node      *kn;

        kp = kroute_find_prefix(kr->af, &kr->prefix, kr->prefixlen);
        if (kp == NULL) {
                kp = calloc(1, sizeof((*kp)));
                if (kp == NULL)
                        fatal(__func__);
                kp->af = kr->af;
                kp->prefix = kr->prefix;
                kp->prefixlen = kr->prefixlen;
                TAILQ_INIT(&kp->priorities);
                RB_INSERT(kroute_tree, &krt, kp);
        }

        kprio = kroute_find_prio(kp, kr->priority);
        if (kprio == NULL) {
                kprio = calloc(1, sizeof(*kprio));
                if (kprio == NULL)
                        fatal(__func__);
                kprio->kp = kp;
                kprio->priority = kr->priority;
                TAILQ_INIT(&kprio->nexthops);

                /* lower priorities first */
                TAILQ_FOREACH(tmp, &kp->priorities, entry)
                        if (tmp->priority > kprio->priority)
                                break;
                if (tmp)
                        TAILQ_INSERT_BEFORE(tmp, kprio, entry);
                else
                        TAILQ_INSERT_TAIL(&kp->priorities, kprio, entry);
        }

        kn = kroute_find_gw(kprio, &kr->nexthop);
        if (kn == NULL) {
                kn = calloc(1, sizeof(*kn));
                if (kn == NULL)
                        fatal(__func__);
                kn->kprio = kprio;
                kn->r = *kr;
                TAILQ_INSERT_TAIL(&kprio->nexthops, kn, entry);
        }

        kr_redistribute(kp);
        return (0);
}

/*
 * Delete the AF_MPLS entry of a route if one is marked as installed
 * (F_LDPD_INSERTED).  Returns -1 if the routing message fails, else 0.
 */
static int
kroute_uninstall(struct kroute_node *kn)
{
        /* nothing was installed for this route */
        if ((kn->r.flags & F_LDPD_INSERTED) == 0)
                return (0);

        /* kill MPLS LSP */
        if (send_rtmsg(kr_state.fd, RTM_DELETE, &kn->r, AF_MPLS) == -1)
                return (-1);

        return (0);
}

/*
 * Forget a kernel route.
 *
 * The nexthop is withdrawn from redistribution, its LSP is uninstalled and
 * the entry is freed.  Priority and prefix levels that become empty are
 * freed as well; if the prefix survives, redistribution is re-evaluated.
 *
 * Returns 0 on success, -1 if the route is unknown or the prefix cannot be
 * removed from the tree.
 */
static int
kroute_remove(struct kroute *kr)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio = NULL;
        struct kroute_node      *kn = NULL;

        kp = kroute_find_prefix(kr->af, &kr->prefix, kr->prefixlen);
        if (kp != NULL)
                kprio = kroute_find_prio(kp, kr->priority);
        if (kprio != NULL)
                kn = kroute_find_gw(kprio, &kr->nexthop);
        if (kn == NULL) {
                log_warnx("%s failed to find %s/%u", __func__,
                    log_addr(kr->af, &kr->prefix), kr->prefixlen);
                return (-1);
        }

        /* drop the nexthop */
        kr_redist_remove(&kn->r);
        kroute_uninstall(kn);
        TAILQ_REMOVE(&kprio->nexthops, kn, entry);
        free(kn);

        /* drop the priority if that was its last nexthop */
        if (TAILQ_EMPTY(&kprio->nexthops)) {
                TAILQ_REMOVE(&kp->priorities, kprio, entry);
                free(kprio);
        }

        /* other priorities remain: a different one may be best now */
        if (!TAILQ_EMPTY(&kp->priorities)) {
                kr_redistribute(kp);
                return (0);
        }

        if (RB_REMOVE(kroute_tree, &krt, kp) == NULL) {
                log_warnx("%s failed for %s/%u", __func__,
                    log_addr(kr->af, &kr->prefix), kp->prefixlen);
                return (-1);
        }
        free(kp);

        return (0);
}

/*
 * Empty the route tree.  Every nexthop is passed to kroute_uninstall()
 * before it is freed; priorities and prefixes are freed once drained.
 */
static void
kroute_clear(void)
{
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        struct kroute_node      *kn;

        /* each pass removes the current head, so always re-fetch it */
        for (kp = RB_MIN(kroute_tree, &krt); kp != NULL;
            kp = RB_MIN(kroute_tree, &krt)) {
                for (kprio = TAILQ_FIRST(&kp->priorities); kprio != NULL;
                    kprio = TAILQ_FIRST(&kp->priorities)) {
                        for (kn = TAILQ_FIRST(&kprio->nexthops); kn != NULL;
                            kn = TAILQ_FIRST(&kprio->nexthops)) {
                                kroute_uninstall(kn);
                                TAILQ_REMOVE(&kprio->nexthops, kn, entry);
                                free(kn);
                        }
                        TAILQ_REMOVE(&kp->priorities, kprio, entry);
                        free(kprio);
                }
                RB_REMOVE(kroute_tree, &krt, kp);
                free(kp);
        }
}

/*
 * rb-tree compare for the interface tree, keyed on ifindex.  The result is
 * b - a, so a node with the larger ifindex compares as the smaller one.
 */
static __inline int
kif_compare(struct kif_node *a, struct kif_node *b)
{
        return (b->k.ifindex - a->k.ifindex);
}

/* tree management */

/*
 * Look up the interface node with the given ifindex.
 * Returns NULL when the interface is not known.
 */
static struct kif_node *
kif_find(unsigned short ifindex)
{
        struct kif_node s;

        /* zeroed search key; kif_compare() only looks at k.ifindex */
        memset(&s, 0, sizeof(s));
        s.k.ifindex = ifindex;

        return (RB_FIND(kif_tree, &kit, &s));
}

struct kif *
kif_findname(char *ifname)
{
        struct kif_node *kif;

        RB_FOREACH(kif, kif_tree, &kit)
                if (!strcmp(ifname, kif->k.ifname))
                        return (&kif->k);

        return (NULL);
}

/*
 * Allocate a zeroed interface node for ifindex and add it to the interface
 * tree.  Returns NULL if the allocation fails; an ifindex that is already
 * in the tree is fatal.
 */
static struct kif_node *
kif_insert(unsigned short ifindex)
{
        struct kif_node *kif;

        kif = calloc(1, sizeof(*kif));
        if (kif == NULL)
                return (NULL);

        TAILQ_INIT(&kif->addrs);
        kif->k.ifindex = ifindex;

        if (RB_INSERT(kif_tree, &kit, kif) != NULL)
                fatalx("kif_insert: RB_INSERT");

        return (kif);
}

/*
 * Remove an interface node from the tree and free it together with its
 * addresses.  With notify set, an IMSG_DELADDR is sent to ldpe for each
 * address first.
 *
 * Returns 0 on success, -1 (after logging) if the node is not in the tree;
 * in that case nothing is freed.
 */
static int
kif_remove(struct kif_node *kif, int notify)
{
        struct kif_addr *ka;

        if (RB_REMOVE(kif_tree, &kit, kif) == NULL) {
                log_warnx("RB_REMOVE(kif_tree, &kit, kif)");
                return (-1);
        }

        /* the head is removed on every pass, so re-fetch it each time */
        for (ka = TAILQ_FIRST(&kif->addrs); ka != NULL;
            ka = TAILQ_FIRST(&kif->addrs)) {
                if (notify) {
                        main_imsg_compose_ldpe(IMSG_DELADDR, 0, &ka->a,
                            sizeof(ka->a));
                }
                TAILQ_REMOVE(&kif->addrs, ka, entry);
                free(ka);
        }

        free(kif);
        return (0);
}

/*
 * Empty the interface tree.  Nodes are removed without sending address
 * withdrawals to ldpe (notify == 0).
 */
void
kif_clear(void)
{
        struct kif_node *kif;

        while ((kif = RB_MIN(kif_tree, &kit)) != NULL)
                kif_remove(kif, 0);
}

/*
 * Create or refresh the interface node for ifindex.
 *
 * flags and the values from ifd overwrite the stored ones.  If sdl is
 * given, the MAC address is copied from it and, when it is an AF_LINK
 * address with a non-empty name, so is the interface name (truncated to
 * fit).
 *
 * For an interface that was already known, *link_old receives its previous
 * "up with link" state; for a new interface *link_old is left untouched,
 * so the caller must initialise it.
 *
 * Returns the node, or NULL if a new node could not be allocated.
 */
static struct kif_node *
kif_update(unsigned short ifindex, int flags, struct if_data *ifd,
    struct sockaddr_dl *sdl, int *link_old)
{
        struct kif_node         *kif;
        size_t                   namelen;

        if ((kif = kif_find(ifindex)) == NULL) {
                if ((kif = kif_insert(ifindex)) == NULL)
                        return (NULL);
        } else
                *link_old = (kif->k.flags & IFF_UP) &&
                    LINK_STATE_IS_UP(kif->k.link_state);

        kif->k.flags = flags;
        kif->k.link_state = ifd->ifi_link_state;
        if (sdl)
                memcpy(kif->k.mac, LLADDR(sdl), sizeof(kif->k.mac));
        kif->k.if_type = ifd->ifi_type;
        kif->k.baudrate = ifd->ifi_baudrate;
        kif->k.mtu = ifd->ifi_mtu;
        kif->k.rdomain = ifd->ifi_rdomain;

        if (sdl && sdl->sdl_family == AF_LINK) {
                namelen = sdl->sdl_nlen;
                if (namelen >= sizeof(kif->k.ifname))
                        namelen = sizeof(kif->k.ifname) - 1;
                if (namelen > 0) {
                        memcpy(kif->k.ifname, sdl->sdl_data, namelen);
                        /*
                         * Terminate explicitly: the zero fill from calloc()
                         * only covers a fresh node.  An existing node may
                         * still hold a longer name whose tail would
                         * otherwise survive the copy.
                         */
                        kif->k.ifname[namelen] = '\0';
                }
        }

        return (kif);
}

static struct kroute_priority *
kroute_match(int af, union ldpd_addr *key)
{
        int                      i, maxprefixlen;
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        union ldpd_addr          addr;

        switch (af) {
        case AF_INET:
                maxprefixlen = 32;
                break;
        case AF_INET6:
                maxprefixlen = 128;
                break;
        default:
                log_warnx("%s: unknown af", __func__);
                return (NULL);
        }

        for (i = maxprefixlen; i >= 0; i--) {
                ldp_applymask(af, &addr, key, i);

                kp = kroute_find_prefix(af, &addr, i);
                if (kp == NULL)
                        continue;

                kprio = kroute_find_prio(kp, RTP_ANY);
                if (kprio != NULL)
                        return (kprio);
        }

        return (NULL);
}

/* misc */

/*
 * Return the classful prefix length for an IPv4 address: 8, 16 and 24 for
 * classes A, B and C, 4 for class D and 32 for class E.  The thresholds
 * are compared against the numeric value of ina as passed in.
 * NOTE(review): presumably callers pass the address in host byte order --
 * confirm.
 */
static uint8_t
prefixlen_classful(in_addr_t ina)
{
        /* lowest address of each class, highest class first */
        static const struct {
                in_addr_t       floor;
                uint8_t         plen;
        } classes[] = {
                { 0xf0000000U, 32 },    /* class E */
                { 0xe0000000U, 4 },     /* class D */
                { 0xc0000000U, 24 },    /* class C */
                { 0x80000000U, 16 },    /* class B */
        };
        size_t  i;

        for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                if (ina >= classes[i].floor)
                        return (classes[i].plen);
        }

        return (8);                     /* class A */
}

/*
 * Round a up to the next multiple of sizeof(long); values <= 0 yield
 * sizeof(long).  get_rtaddrs() uses it to step from one sockaddr to the
 * next.  The argument is evaluated more than once.
 */
#define ROUNDUP(a) \
        ((a) > 0 ? (1 + (((a) - 1) | (sizeof(long) - 1))) : sizeof(long))

/*
 * Split a packed run of sockaddrs into rti_info[0..RTAX_MAX-1].
 *
 * For every bit i set in addrs, rti_info[i] is pointed at the current
 * sockaddr and the cursor advances by its rounded-up sa_len; slots whose
 * bit is clear are set to NULL.
 */
static void
get_rtaddrs(int addrs, struct sockaddr *sa, struct sockaddr **rti_info)
{
        int     i;

        for (i = 0; i < RTAX_MAX; i++) {
                if ((addrs & (1 << i)) == 0) {
                        rti_info[i] = NULL;
                        continue;
                }

                rti_info[i] = sa;
                sa = (struct sockaddr *)((char *)(sa) +
                    ROUNDUP(sa->sa_len));
        }
}

static void
if_change(unsigned short ifindex, int flags, struct if_data *ifd,
    struct sockaddr_dl *sdl)
{
        struct kif_node         *kif;
        struct kif_addr         *ka;
        int                      link_old = 0, link_new;

        kif = kif_update(ifindex, flags, ifd, sdl, &link_old);
        if (!kif) {
                log_warn("%s: kif_update(%u)", __func__, ifindex);
                return;
        }
        link_new = (kif->k.flags & IFF_UP) &&
            LINK_STATE_IS_UP(kif->k.link_state);

        if (link_new == link_old)
                return;

        main_imsg_compose_ldpe(IMSG_IFSTATUS, 0, &kif->k, sizeof(struct kif));
        if (link_new) {
                TAILQ_FOREACH(ka, &kif->addrs, entry)
                        main_imsg_compose_ldpe(IMSG_NEWADDR, 0, &ka->a,
                            sizeof(ka->a));
        } else {
                TAILQ_FOREACH(ka, &kif->addrs, entry)
                        main_imsg_compose_ldpe(IMSG_DELADDR, 0, &ka->a,
                            sizeof(ka->a));
        }
}

/*
 * Record a new IPv4/IPv6 address on interface ifindex and notify ldpe.
 * Addresses rejected by bad_addr_v4()/bad_addr_v6() and other address
 * families are ignored.  mask and brd may be NULL.
 */
static void
if_newaddr(unsigned short ifindex, struct sockaddr *ifa, struct sockaddr *mask,
    struct sockaddr *brd)
{
        struct kif_node         *kif;
        struct kif_addr         *ka;
        struct sockaddr_in      *sin;
        struct sockaddr_in6     *sin6;

        if (ifa == NULL)
                return;

        kif = kif_find(ifindex);
        if (kif == NULL) {
                log_warnx("%s: corresponding if %d not found", __func__,
                    ifindex);
                return;
        }

        /* filter first, so nothing is allocated for ignored addresses */
        switch (ifa->sa_family) {
        case AF_INET:
                sin = (struct sockaddr_in *)ifa;
                if (bad_addr_v4(sin->sin_addr))
                        return;
                break;
        case AF_INET6:
                sin6 = (struct sockaddr_in6 *)ifa;
                /* We only care about link-local and global-scope. */
                if (bad_addr_v6(&sin6->sin6_addr))
                        return;
                clearscope(&sin6->sin6_addr);
                break;
        default:
                return;
        }

        ka = calloc(1, sizeof(*ka));
        if (ka == NULL)
                fatal("if_newaddr");

        ka->a.ifindex = ifindex;
        ka->a.af = ifa->sa_family;

        if (ifa->sa_family == AF_INET) {
                ka->a.addr.v4 = ((struct sockaddr_in *)ifa)->sin_addr;
                if (mask != NULL)
                        ka->a.prefixlen = mask2prefixlen(
                            ((struct sockaddr_in *)mask)->sin_addr.s_addr);
                if (brd != NULL)
                        ka->a.dstbrd.v4 =
                            ((struct sockaddr_in *)brd)->sin_addr;
        } else {
                ka->a.addr.v6 = ((struct sockaddr_in6 *)ifa)->sin6_addr;
                if (mask != NULL)
                        ka->a.prefixlen =
                            mask2prefixlen6((struct sockaddr_in6 *)mask);
                if (brd != NULL)
                        ka->a.dstbrd.v6 =
                            ((struct sockaddr_in6 *)brd)->sin6_addr;
        }

        TAILQ_INSERT_TAIL(&kif->addrs, ka, entry);

        /* notify ldpe about new address */
        main_imsg_compose_ldpe(IMSG_NEWADDR, 0, &ka->a, sizeof(ka->a));
}

static void
if_deladdr(unsigned short ifindex, struct sockaddr *ifa, struct sockaddr *mask,
    struct sockaddr *brd)
{
        struct kif_node         *kif;
        struct sockaddr_in      *ifa4, *mask4, *brd4;
        struct sockaddr_in6     *ifa6, *mask6, *brd6;
        struct kaddr             k;
        struct kif_addr         *ka, *nka;

        if (ifa == NULL)
                return;
        if ((kif = kif_find(ifindex)) == NULL) {
                log_warnx("%s: corresponding if %d not found", __func__,
                    ifindex);
                return;
        }

        memset(&k, 0, sizeof(k));
        k.af = ifa->sa_family;
        switch (ifa->sa_family) {
        case AF_INET:
                ifa4 = (struct sockaddr_in *) ifa;
                mask4 = (struct sockaddr_in *) mask;
                brd4 = (struct sockaddr_in *) brd;

                /* filter out unwanted addresses */
                if (bad_addr_v4(ifa4->sin_addr))
                        return;

                k.addr.v4 = ifa4->sin_addr;
                if (mask4)
                        k.prefixlen = mask2prefixlen(mask4->sin_addr.s_addr);
                if (brd4)
                        k.dstbrd.v4 = brd4->sin_addr;
                break;
        case AF_INET6:
                ifa6 = (struct sockaddr_in6 *) ifa;
                mask6 = (struct sockaddr_in6 *) mask;
                brd6 = (struct sockaddr_in6 *) brd;

                /* We only care about link-local and global-scope. */
                if (bad_addr_v6(&ifa6->sin6_addr))
                        return;

                clearscope(&ifa6->sin6_addr);

                k.addr.v6 = ifa6->sin6_addr;
                if (mask6)
                        k.prefixlen = mask2prefixlen6(mask6);
                if (brd6)
                        k.dstbrd.v6 = brd6->sin6_addr;
                break;
        default:
                return;
        }

        for (ka = TAILQ_FIRST(&kif->addrs); ka != NULL; ka = nka) {
                nka = TAILQ_NEXT(ka, entry);

                if (ka->a.af != k.af ||
                    ka->a.prefixlen != k.prefixlen ||
                    ldp_addrcmp(ka->a.af, &ka->a.addr, &k.addr))
                        continue;

                /* notify ldpe about removed address */
                main_imsg_compose_ldpe(IMSG_DELADDR, 0, &ka->a, sizeof(ka->a));
                TAILQ_REMOVE(&kif->addrs, ka, entry);
                free(ka);
                return;
        }
}

/*
 * Handle an interface announcement message: create a kif entry (and record
 * its name) when an interface arrives, drop the entry when it departs.
 * Other announcement kinds are ignored.
 */
static void
if_announce(void *msg)
{
        struct if_announcemsghdr        *ifan = msg;
        struct kif_node                 *kif;

        if (ifan->ifan_what == IFAN_ARRIVAL) {
                if ((kif = kif_insert(ifan->ifan_index)) != NULL)
                        strlcpy(kif->k.ifname, ifan->ifan_name,
                            sizeof(kif->k.ifname));
        } else if (ifan->ifan_what == IFAN_DEPARTURE) {
                if ((kif = kif_find(ifan->ifan_index)) != NULL)
                        kif_remove(kif, 1);
        }
}

/* rtsock */
/*
 * Dispatch a routing message to the IPv4 or IPv6 sender according to the
 * address family of the route; any other family is a fatal error.
 */
static int
send_rtmsg(int fd, int action, struct kroute *kr, int family)
{
        if (kr->af == AF_INET)
                return (send_rtmsg_v4(fd, action, kr, family));
        if (kr->af == AF_INET6)
                return (send_rtmsg_v6(fd, action, kr, family));

        fatalx("send_rtmsg: unknown af");
}

/*
 * Build a routing message for the IPv4 route kr and write it to fd.
 *
 * family selects what is being programmed: with AF_MPLS the destination is
 * the local label, otherwise it is the IPv4 prefix plus netmask.  The
 * nexthop is always sent as gateway, and the remote label (if any) is
 * attached unless the action is RTM_DELETE.
 *
 * Returns 0 on success and also when nothing is sent: FIB synchronisation
 * disabled, AF_MPLS with a local label below MPLS_LABEL_RESERVED_MAX, or a
 * non-MPLS route whose remote label is implicit NULL.  Returns -1 when
 * writev() fails.
 */
static int
send_rtmsg_v4(int fd, int action, struct kroute *kr, int family)
{
        /* header + destination + gateway + netmask + outgoing label */
        struct iovec            iov[5];
        struct rt_msghdr        hdr;
        struct sockaddr_mpls    label_in, label_out;
        struct sockaddr_in      dst, mask, nexthop;
        int                     iovcnt = 0;

        if (kr_state.fib_sync == 0)
                return (0);

        /*
         * Reserved labels (implicit and explicit NULL) should not be added
         * to the FIB.
         */
        if (family == AF_MPLS && kr->local_label < MPLS_LABEL_RESERVED_MAX)
                return (0);

        /* initialize header */
        memset(&hdr, 0, sizeof(hdr));
        hdr.rtm_version = RTM_VERSION;

        hdr.rtm_type = action;
        hdr.rtm_flags = RTF_UP;
        hdr.rtm_fmask = RTF_MPLS;
        hdr.rtm_seq = kr_state.rtseq++; /* overflow doesn't matter */
        hdr.rtm_msglen = sizeof(hdr);
        hdr.rtm_hdrlen = sizeof(struct rt_msghdr);
        hdr.rtm_priority = kr->priority;
        hdr.rtm_tableid = kr_state.rdomain;     /* rtableid */
        /* adjust iovec */
        iov[iovcnt].iov_base = &hdr;
        iov[iovcnt++].iov_len = sizeof(hdr);

        /* destination: the local label for MPLS routes, the prefix otherwise */
        if (family == AF_MPLS) {
                memset(&label_in, 0, sizeof(label_in));
                label_in.smpls_len = sizeof(label_in);
                label_in.smpls_family = AF_MPLS;
                label_in.smpls_label =
                    htonl(kr->local_label << MPLS_LABEL_OFFSET);
                /* adjust header */
                hdr.rtm_flags |= RTF_MPLS | RTF_MPATH;
                hdr.rtm_addrs |= RTA_DST;
                hdr.rtm_msglen += sizeof(label_in);
                /* adjust iovec */
                iov[iovcnt].iov_base = &label_in;
                iov[iovcnt++].iov_len = sizeof(label_in);
        } else {
                memset(&dst, 0, sizeof(dst));
                dst.sin_len = sizeof(dst);
                dst.sin_family = AF_INET;
                dst.sin_addr = kr->prefix.v4;
                /* adjust header */
                hdr.rtm_addrs |= RTA_DST;
                hdr.rtm_msglen += sizeof(dst);
                /* adjust iovec */
                iov[iovcnt].iov_base = &dst;
                iov[iovcnt++].iov_len = sizeof(dst);
        }

        /* gateway: always the IPv4 nexthop */
        memset(&nexthop, 0, sizeof(nexthop));
        nexthop.sin_len = sizeof(nexthop);
        nexthop.sin_family = AF_INET;
        nexthop.sin_addr = kr->nexthop.v4;
        /* adjust header */
        hdr.rtm_flags |= RTF_GATEWAY;
        hdr.rtm_addrs |= RTA_GATEWAY;
        hdr.rtm_msglen += sizeof(nexthop);
        /* adjust iovec */
        iov[iovcnt].iov_base = &nexthop;
        iov[iovcnt++].iov_len = sizeof(nexthop);

        /* netmask: only sent for IPv4 prefix routes */
        if (family == AF_INET) {
                memset(&mask, 0, sizeof(mask));
                mask.sin_len = sizeof(mask);
                mask.sin_family = AF_INET;
                mask.sin_addr.s_addr = prefixlen2mask(kr->prefixlen);
                /* adjust header */
                hdr.rtm_addrs |= RTA_NETMASK;
                hdr.rtm_msglen += sizeof(mask);
                /* adjust iovec */
                iov[iovcnt].iov_base = &mask;
                iov[iovcnt++].iov_len = sizeof(mask);
        }

        /* If action is RTM_DELETE we have to get rid of MPLS infos */
        if (kr->remote_label != NO_LABEL && action != RTM_DELETE) {
                memset(&label_out, 0, sizeof(label_out));
                label_out.smpls_len = sizeof(label_out);
                label_out.smpls_family = AF_MPLS;
                label_out.smpls_label =
                    htonl(kr->remote_label << MPLS_LABEL_OFFSET);
                /* adjust header */
                hdr.rtm_addrs |= RTA_SRC;
                hdr.rtm_flags |= RTF_MPLS;
                hdr.rtm_msglen += sizeof(label_out);
                /* adjust iovec */
                iov[iovcnt].iov_base = &label_out;
                iov[iovcnt++].iov_len = sizeof(label_out);

                /*
                 * Pick the label operation.  An implicit NULL remote label
                 * means pop for MPLS routes; for prefix routes nothing is
                 * sent at all.
                 */
                if (kr->remote_label == MPLS_LABEL_IMPLNULL) {
                        if (family == AF_MPLS)
                                hdr.rtm_mpls = MPLS_OP_POP;
                        else
                                return (0);
                } else {
                        if (family == AF_MPLS)
                                hdr.rtm_mpls = MPLS_OP_SWAP;
                        else
                                hdr.rtm_mpls = MPLS_OP_PUSH;
                }
        }

 retry:
        if (writev(fd, iov, iovcnt) == -1) {
                if (errno == ESRCH) {
                        /*
                         * A change of an MPLS route that does not exist is
                         * retried as an add.
                         */
                        if (hdr.rtm_type == RTM_CHANGE && family == AF_MPLS) {
                                hdr.rtm_type = RTM_ADD;
                                goto retry;
                        } else if (hdr.rtm_type == RTM_DELETE) {
                                log_info("route %s/%u vanished before delete",
                                    inet_ntoa(kr->prefix.v4), kr->prefixlen);
                                return (-1);
                        }
                }
                log_warn("%s action %u, af %s, prefix %s/%u", __func__,
                    hdr.rtm_type, af_name(family), inet_ntoa(kr->prefix.v4),
                    kr->prefixlen);
                return (-1);
        }

        return (0);
}

static int
send_rtmsg_v6(int fd, int action, struct kroute *kr, int family)
{
        struct iovec            iov[5];
        struct rt_msghdr        hdr;
        struct sockaddr_mpls    label_in, label_out;
        struct sockaddr_in6     dst, mask, nexthop;
        int                     iovcnt = 0;

        if (kr_state.fib_sync == 0)
                return (0);

        /*
         * Reserved labels (implicit and explicit NULL) should not be added
         * to the FIB.
         */
        if (family == AF_MPLS && kr->local_label < MPLS_LABEL_RESERVED_MAX)
                return (0);

        /* initialize header */
        memset(&hdr, 0, sizeof(hdr));
        hdr.rtm_version = RTM_VERSION;

        hdr.rtm_type = action;
        hdr.rtm_flags = RTF_UP;
        hdr.rtm_fmask = RTF_MPLS;
        hdr.rtm_seq = kr_state.rtseq++; /* overflow doesn't matter */
        hdr.rtm_msglen = sizeof(hdr);
        hdr.rtm_hdrlen = sizeof(struct rt_msghdr);
        hdr.rtm_priority = kr->priority;
        hdr.rtm_tableid = kr_state.rdomain;     /* rtableid */
        /* adjust iovec */
        iov[iovcnt].iov_base = &hdr;
        iov[iovcnt++].iov_len = sizeof(hdr);

        if (family == AF_MPLS) {
                memset(&label_in, 0, sizeof(label_in));
                label_in.smpls_len = sizeof(label_in);
                label_in.smpls_family = AF_MPLS;
                label_in.smpls_label =
                    htonl(kr->local_label << MPLS_LABEL_OFFSET);
                /* adjust header */
                hdr.rtm_flags |= RTF_MPLS | RTF_MPATH;
                hdr.rtm_addrs |= RTA_DST;
                hdr.rtm_msglen += sizeof(label_in);
                /* adjust iovec */
                iov[iovcnt].iov_base = &label_in;
                iov[iovcnt++].iov_len = sizeof(label_in);
        } else {
                memset(&dst, 0, sizeof(dst));
                dst.sin6_len = sizeof(dst);
                dst.sin6_family = AF_INET6;
                dst.sin6_addr = kr->prefix.v6;
                /* adjust header */
                hdr.rtm_addrs |= RTA_DST;
                hdr.rtm_msglen += ROUNDUP(sizeof(dst));
                /* adjust iovec */
                iov[iovcnt].iov_base = &dst;
                iov[iovcnt++].iov_len = ROUNDUP(sizeof(dst));
        }

        memset(&nexthop, 0, sizeof(nexthop));
        nexthop.sin6_len = sizeof(nexthop);
        nexthop.sin6_family = AF_INET6;
        nexthop.sin6_addr = kr->nexthop.v6;
        nexthop.sin6_scope_id = kr->ifindex;
        /*
         * XXX we should set the sin6_scope_id but the kernel
         * XXX does not expect it that way. It must be fiddled
         * XXX into the sin6_addr. Welcome to the typical
         * XXX IPv6 insanity and all without wine bottles.
         */
        embedscope(&nexthop);

        /* adjust header */
        hdr.rtm_flags |= RTF_GATEWAY;
        hdr.rtm_addrs |= RTA_GATEWAY;
        hdr.rtm_msglen += ROUNDUP(sizeof(nexthop));
        /* adjust iovec */
        iov[iovcnt].iov_base = &nexthop;
        iov[iovcnt++].iov_len = ROUNDUP(sizeof(nexthop));

        if (family == AF_INET6) {
                memset(&mask, 0, sizeof(mask));
                mask.sin6_len = sizeof(mask);
                mask.sin6_family = AF_INET6;
                mask.sin6_addr = *prefixlen2mask6(kr->prefixlen);
                /* adjust header */
                if (kr->prefixlen == 128)
                        hdr.rtm_flags |= RTF_HOST;
                hdr.rtm_addrs |= RTA_NETMASK;
                hdr.rtm_msglen += ROUNDUP(sizeof(mask));
                /* adjust iovec */
                iov[iovcnt].iov_base = &mask;
                iov[iovcnt++].iov_len = ROUNDUP(sizeof(mask));
        }

        /* If action is RTM_DELETE we have to get rid of MPLS infos */
        if (kr->remote_label != NO_LABEL && action != RTM_DELETE) {
                memset(&label_out, 0, sizeof(label_out));
                label_out.smpls_len = sizeof(label_out);
                label_out.smpls_family = AF_MPLS;
                label_out.smpls_label =
                    htonl(kr->remote_label << MPLS_LABEL_OFFSET);
                /* adjust header */
                hdr.rtm_addrs |= RTA_SRC;
                hdr.rtm_flags |= RTF_MPLS;
                hdr.rtm_msglen += sizeof(label_out);
                /* adjust iovec */
                iov[iovcnt].iov_base = &label_out;
                iov[iovcnt++].iov_len = sizeof(label_out);

                if (kr->remote_label == MPLS_LABEL_IMPLNULL) {
                        if (family == AF_MPLS)
                                hdr.rtm_mpls = MPLS_OP_POP;
                        else
                                return (0);
                } else {
                        if (family == AF_MPLS)
                                hdr.rtm_mpls = MPLS_OP_SWAP;
                        else
                                hdr.rtm_mpls = MPLS_OP_PUSH;
                }
        }

 retry:
        if (writev(fd, iov, iovcnt) == -1) {
                if (errno == ESRCH) {
                        if (hdr.rtm_type == RTM_CHANGE && family == AF_MPLS) {
                                hdr.rtm_type = RTM_ADD;
                                goto retry;
                        } else if (hdr.rtm_type == RTM_DELETE) {
                                log_info("route %s/%u vanished before delete",
                                    log_addr(kr->af, &kr->prefix),
                                    kr->prefixlen);
                                return (-1);
                        }
                }
                log_warn("%s action %u, af %s, prefix %s/%u", __func__,
                    hdr.rtm_type, af_name(family), log_addr(kr->af,
                    &kr->prefix), kr->prefixlen);
                return (-1);
        }
        return (0);
}

/*
 * Read the routing table of the configured rdomain through sysctl(2)
 * and feed the dump to rtmsg_process().  Returns the result of
 * rtmsg_process(), or -1 when the dump cannot be obtained.
 */
static int
fetchtable(void)
{
        int      mib[7] = { CTL_NET, PF_ROUTE, 0, 0, NET_RT_DUMP, 0, 0 };
        size_t   needed;
        char    *dump;
        int      ret = -1;

        mib[6] = kr_state.rdomain;      /* rtableid */

        /* first call only reports the required buffer size */
        if (sysctl(mib, 7, NULL, &needed, NULL, 0) == -1) {
                log_warn("sysctl");
                return (-1);
        }

        dump = malloc(needed);
        if (dump == NULL) {
                log_warn(__func__);
                return (-1);
        }

        if (sysctl(mib, 7, dump, &needed, NULL, 0) == -1)
                log_warn("sysctl");
        else
                ret = rtmsg_process(dump, needed);

        free(dump);
        return (ret);
}

/*
 * Read the interface list through sysctl(2) and feed the dump to
 * rtmsg_process().  Returns the result of rtmsg_process(), or -1 when the
 * dump cannot be obtained.
 */
static int
fetchifs(void)
{
        /* mib[3] == 0 is the address family wildcard */
        int      mib[6] = { CTL_NET, PF_ROUTE, 0, 0, NET_RT_IFLIST, 0 };
        size_t   needed;
        char    *dump;
        int      ret = -1;

        /* first call only reports the required buffer size */
        if (sysctl(mib, 6, NULL, &needed, NULL, 0) == -1) {
                log_warn("sysctl");
                return (-1);
        }

        dump = malloc(needed);
        if (dump == NULL) {
                log_warn(__func__);
                return (-1);
        }

        if (sysctl(mib, 6, dump, &needed, NULL, 0) == -1)
                log_warn("sysctl");
        else
                ret = rtmsg_process(dump, needed);

        free(dump);
        return (ret);
}

/*
 * Read pending messages from the routing socket and process them.
 * Returns 0 when the read was merely interrupted or would block, -1 on a
 * read error or when the socket was closed, and otherwise the result of
 * rtmsg_process().
 */
static int
dispatch_rtmsg(void)
{
        char     msgbuf[RT_BUF_SIZE];
        ssize_t  nread;

        nread = read(kr_state.fd, msgbuf, sizeof(msgbuf));

        if (nread == -1) {
                if (errno != EAGAIN && errno != EINTR) {
                        log_warn("%s: read error", __func__);
                        return (-1);
                }
                return (0);
        }
        if (nread == 0) {
                log_warnx("routing socket closed");
                return (-1);
        }

        return (rtmsg_process(msgbuf, nread));
}

static int
rtmsg_process(char *buf, size_t len)
{
        struct rt_msghdr        *rtm;
        struct if_msghdr         ifm;
        struct ifa_msghdr       *ifam;
        struct sockaddr         *sa, *rti_info[RTAX_MAX];
        size_t                   offset;
        char                    *next;

        for (offset = 0; offset < len; offset += rtm->rtm_msglen) {
                next = buf + offset;
                rtm = (struct rt_msghdr *)next;
                if (len < offset + sizeof(unsigned short) ||
                    len < offset + rtm->rtm_msglen)
                        fatalx("rtmsg_process: partial rtm in buffer");
                if (rtm->rtm_version != RTM_VERSION)
                        continue;

                sa = (struct sockaddr *)(next + rtm->rtm_hdrlen);
                get_rtaddrs(rtm->rtm_addrs, sa, rti_info);

                switch (rtm->rtm_type) {
                case RTM_ADD:
                case RTM_GET:
                case RTM_CHANGE:
                case RTM_DELETE:
                        if (rtm->rtm_errno)             /* failed attempts... */
                                continue;

                        if (rtm->rtm_tableid != kr_state.rdomain)
                                continue;

                        if (rtm->rtm_type == RTM_GET &&
                            rtm->rtm_pid != kr_state.pid)
                                continue;

                        /* Skip ARP/ND cache and broadcast routes. */
                        if (rtm->rtm_flags & (RTF_LLINFO|RTF_BROADCAST))
                                continue;

                        /* LDP should follow the IGP and ignore BGP routes */
                        if (rtm->rtm_priority == RTP_BGP)
                                continue;

                        if (rtmsg_process_route(rtm, rti_info) == -1)
                                return (-1);
                }

                switch (rtm->rtm_type) {
                case RTM_IFINFO:
                        memcpy(&ifm, next, sizeof(ifm));
                        if_change(ifm.ifm_index, ifm.ifm_flags, &ifm.ifm_data,
                            (struct sockaddr_dl *)rti_info[RTAX_IFP]);
                        break;
                case RTM_NEWADDR:
                        ifam = (struct ifa_msghdr *)rtm;
                        if ((ifam->ifam_addrs & (RTA_NETMASK | RTA_IFA |
                            RTA_BRD)) == 0)
                                break;

                        if_newaddr(ifam->ifam_index,
                            (struct sockaddr *)rti_info[RTAX_IFA],
                            (struct sockaddr *)rti_info[RTAX_NETMASK],
                            (struct sockaddr *)rti_info[RTAX_BRD]);
                        break;
                case RTM_DELADDR:
                        ifam = (struct ifa_msghdr *)rtm;
                        if ((ifam->ifam_addrs & (RTA_NETMASK | RTA_IFA |
                            RTA_BRD)) == 0)
                                break;

                        if_deladdr(ifam->ifam_index,
                            (struct sockaddr *)rti_info[RTAX_IFA],
                            (struct sockaddr *)rti_info[RTAX_NETMASK],
                            (struct sockaddr *)rti_info[RTAX_BRD]);
                        break;
                case RTM_IFANNOUNCE:
                        if_announce(next);
                        break;
                default:
                        /* ignore for now */
                        break;
                }
        }

        return (offset);
}

/*
 * Translate one route message into a struct kroute and apply it to the
 * local route table: remove it for RTM_DELETE, update the matching entry
 * when one with the same prefix, priority and gateway exists, insert a new
 * entry otherwise.
 *
 * rti_info holds the sockaddrs of the message as split up by get_rtaddrs().
 * Destinations that are neither AF_INET nor AF_INET6 are ignored.
 *
 * Returns -1 when the message has no destination or when a route that is
 * not connected lacks a nexthop; for RTM_DELETE of a known route the result
 * of kroute_remove(); 0 otherwise.
 */
static int
rtmsg_process_route(struct rt_msghdr *rtm, struct sockaddr *rti_info[RTAX_MAX])
{
        struct sockaddr         *sa;
        struct sockaddr_in      *sa_in;
        struct sockaddr_in6     *sa_in6;
        struct kroute            kr;
        struct kroute_prefix    *kp;
        struct kroute_priority  *kprio;
        struct kroute_node      *kn;

        if ((sa = rti_info[RTAX_DST]) == NULL)
                return (-1);

        memset(&kr, 0, sizeof(kr));
        kr.af = sa->sa_family;
        /*
         * Prefix and prefix length: use the netmask when the message has
         * one, otherwise derive the length from the RTF_HOST flag or the
         * address itself.
         */
        switch (kr.af) {
        case AF_INET:
                kr.prefix.v4 = ((struct sockaddr_in *)sa)->sin_addr;
                sa_in = (struct sockaddr_in *) rti_info[RTAX_NETMASK];
                if (sa_in != NULL && sa_in->sin_len != 0)
                        kr.prefixlen = mask2prefixlen(sa_in->sin_addr.s_addr);
                else if (rtm->rtm_flags & RTF_HOST)
                        kr.prefixlen = 32;
                else if (kr.prefix.v4.s_addr == INADDR_ANY)
                        kr.prefixlen = 0;
                else
                        kr.prefixlen = prefixlen_classful(kr.prefix.v4.s_addr);
                break;
        case AF_INET6:
                kr.prefix.v6 = ((struct sockaddr_in6 *)sa)->sin6_addr;
                sa_in6 = (struct sockaddr_in6 *)rti_info[RTAX_NETMASK];
                if (sa_in6 != NULL && sa_in6->sin6_len != 0)
                        kr.prefixlen = mask2prefixlen6(sa_in6);
                else if (rtm->rtm_flags & RTF_HOST)
                        kr.prefixlen = 128;
                else if (IN6_IS_ADDR_UNSPECIFIED(&kr.prefix.v6))
                        kr.prefixlen = 0;
                else
                        fatalx("in6 net addr without netmask");
                break;
        default:
                return (0);
        }
        kr.ifindex = rtm->rtm_index;
        if ((sa = rti_info[RTAX_GATEWAY]) != NULL) {
                switch (sa->sa_family) {
                case AF_INET:
                        kr.nexthop.v4 = ((struct sockaddr_in *)sa)->sin_addr;
                        break;
                case AF_INET6:
                        sa_in6 = (struct sockaddr_in6 *)sa;
                        recoverscope(sa_in6);
                        kr.nexthop.v6 = sa_in6->sin6_addr;
                        /* a scoped gateway overrides the message's ifindex */
                        if (sa_in6->sin6_scope_id)
                                kr.ifindex = sa_in6->sin6_scope_id;
                        break;
                case AF_LINK:
                        /* a link-layer gateway marks a connected route */
                        kr.flags |= F_CONNECTED;
                        break;
                }
        }

        if (rtm->rtm_flags & RTF_STATIC)
                kr.flags |= F_STATIC;
        if (rtm->rtm_flags & RTF_BLACKHOLE)
                kr.flags |= F_BLACKHOLE;
        if (rtm->rtm_flags & RTF_REJECT)
                kr.flags |= F_REJECT;
        if (rtm->rtm_flags & RTF_DYNAMIC)
                kr.flags |= F_DYNAMIC;
        /* routes attached to connected or loopback interfaces */
        if (rtm->rtm_flags & RTF_CONNECTED ||
            ldp_addrcmp(kr.af, &kr.prefix, &kr.nexthop) == 0)
                kr.flags |= F_CONNECTED;
        kr.priority = rtm->rtm_priority;

        if (rtm->rtm_type == RTM_CHANGE) {
                /*
                 * The kernel doesn't allow RTM_CHANGE for multipath routes.
                 * If we got this message we know that the route has only one
                 * nexthop and we should remove it before installing the same
                 * route with the new nexthop.
                 */
                kp = kroute_find_prefix(kr.af, &kr.prefix, kr.prefixlen);
                if (kp) {
                        kprio = kroute_find_prio(kp, kr.priority);
                        if (kprio) {
                                kn = TAILQ_FIRST(&kprio->nexthops);
                                if (kn)
                                        kroute_remove(&kn->r);
                        }
                }
        }

        /* look for an existing entry: same prefix, priority and gateway */
        kn = NULL;
        kp = kroute_find_prefix(kr.af, &kr.prefix, kr.prefixlen);
        if (kp) {
                kprio = kroute_find_prio(kp, kr.priority);
                if (kprio)
                        kn = kroute_find_gw(kprio, &kr.nexthop);
        }

        if (rtm->rtm_type == RTM_DELETE) {
                /* deleting a route we never knew about is not an error */
                if (kn == NULL)
                        return (0);
                return (kroute_remove(&kr));
        }

        if (!ldp_addrisset(kr.af, &kr.nexthop) && !(kr.flags & F_CONNECTED)) {
                log_warnx("%s: no nexthop for %s/%u", __func__,
                    log_addr(kr.af, &kr.prefix), kr.prefixlen);
                return (-1);
        }

        if (kn != NULL) {
                /* update route */
                /*
                 * NOTE(review): kr's label fields are still zero from the
                 * memset above, so this copy overwrites whatever labels
                 * kn->r held -- presumably kr_redistribute() takes care of
                 * label state; confirm.
                 */
                kn->r = kr;
                kr_redistribute(kp);
        } else {
                kr.local_label = NO_LABEL;
                kr.remote_label = NO_LABEL;
                kroute_insert(&kr);
        }

        return (0);
}

/*
 * Remember the pseudowire configuration kpw on its interface (looked up by
 * kpw->ifindex) and install it in the kernel.
 *
 * Returns -1 when the interface is unknown, otherwise the result of
 * kmpw_install().  Running out of memory is fatal.
 */
int
kmpw_set(struct kpw *kpw)
{
        struct kif_node         *kif;

        kif = kif_find(kpw->ifindex);
        if (kif == NULL) {
                log_warnx("%s: failed to find mpw by index (%u)", __func__,
                    kpw->ifindex);
                return (-1);
        }

        if (kif->kpw == NULL) {
                /* never copy through a failed allocation */
                kif->kpw = malloc(sizeof(*kif->kpw));
                if (kif->kpw == NULL)
                        fatal("kmpw_set");
        }
        *kif->kpw = *kpw;

        return (kmpw_install(kif->k.ifname, kpw));
}

/*
 * Drop the pseudowire configuration stored on the interface identified by
 * kpw->ifindex and clear it in the kernel.
 *
 * Returns -1 when the interface is unknown or has no configuration stored,
 * otherwise the result of kmpw_uninstall().
 */
int
kmpw_unset(struct kpw *kpw)
{
        struct kif_node         *kif = kif_find(kpw->ifindex);

        if (kif == NULL) {
                log_warnx("%s: failed to find mpw by index (%u)", __func__,
                    kpw->ifindex);
                return (-1);
        }

        if (kif->kpw != NULL) {
                free(kif->kpw);
                kif->kpw = NULL;
                return (kmpw_uninstall(kif->k.ifname));
        }

        log_warnx("%s: %s is not set", __func__, kif->k.ifname);
        return (-1);
}

static int
kmpw_install(const char *ifname, struct kpw *kpw)
{
        struct ifreq             ifr;
        struct ifmpwreq          imr;

        memset(&imr, 0, sizeof(imr));
        switch (kpw->pw_type) {
        case PW_TYPE_ETHERNET:
                imr.imr_type = IMR_TYPE_ETHERNET;
                break;
        case PW_TYPE_ETHERNET_TAGGED:
                imr.imr_type = IMR_TYPE_ETHERNET_TAGGED;
                break;
        default:
                log_warnx("%s: unhandled pseudowire type (%#X)", __func__,
                    kpw->pw_type);
                return (-1);
        }

        if (kpw->flags & F_PW_CWORD)
                imr.imr_flags |= IMR_FLAG_CONTROLWORD;

        memcpy(&imr.imr_nexthop, addr2sa(kpw->af, &kpw->nexthop, 0),
            sizeof(imr.imr_nexthop));

        imr.imr_lshim.shim_label = kpw->local_label;
        imr.imr_rshim.shim_label = kpw->remote_label;

        memset(&ifr, 0, sizeof(ifr));
        strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
        ifr.ifr_data = (caddr_t) &imr;
        if (ioctl(kr_state.ioctl_fd, SIOCSETMPWCFG, &ifr) == -1) {
                log_warn("ioctl SIOCSETMPWCFG");
                return (-1);
        }

        return (0);
}

static int
kmpw_uninstall(const char *ifname)
{
        struct ifreq             ifr;
        struct ifmpwreq          imr;

        memset(&ifr, 0, sizeof(ifr));
        memset(&imr, 0, sizeof(imr));
        strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name));
        ifr.ifr_data = (caddr_t) &imr;
        if (ioctl(kr_state.ioctl_fd, SIOCSETMPWCFG, &ifr) == -1) {
                log_warn("ioctl SIOCSETMPWCFG");
                return (-1);
        }

        return (0);
}

int
kmpw_find(const char *ifname)
{
        struct ifreq             ifr;

        memset(&ifr, 0, sizeof(ifr));
        if (strlcpy(ifr.ifr_name, ifname, sizeof(ifr.ifr_name)) >=
            sizeof(ifr.ifr_name)) {
                errno = ENAMETOOLONG;
                return (-1);
        }

        if (ioctl(kr_state.ioctl_fd, SIOCGPWE3, &ifr) == -1)
                return (-1);

        if (ifr.ifr_pwe3 != IF_PWE3_ETHERNET) {
                errno = EPFNOSUPPORT;
                return (-1);
        }

        return (0);
}