root/tools/tools/netmap/pkt-gen.c
/*
 * Copyright (C) 2011-2014 Matteo Landi, Luigi Rizzo. All rights reserved.
 * Copyright (C) 2013-2015 Universita` di Pisa. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *   1. Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *   2. Redistributions in binary form must reproduce the above copyright
 *      notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * $Id: pkt-gen.c 12346 2013-06-12 17:36:25Z luigi $
 *
 * Example program to show how to build a multithreaded packet
 * source/sink using the netmap device.
 *
 * In this example we create a programmable number of threads
 * to take care of all the queues of the interface used to
 * send or receive traffic.
 *
 */

#define _GNU_SOURCE     /* for CPU_SET() */
#include <arpa/inet.h>  /* ntohs */
#include <assert.h>
#include <ctype.h>      // isprint()
#include <errno.h>
#include <fcntl.h>
#include <ifaddrs.h>    /* getifaddrs */
#include <libnetmap.h>
#include <math.h>
#include <net/ethernet.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <netinet/udp.h>
#ifndef NO_PCAP
#include <pcap/pcap.h>
#endif
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/poll.h>
#include <sys/stat.h>
#if !defined(_WIN32) && !defined(linux)
#include <sys/sysctl.h> /* sysctl */
#endif
#include <sys/types.h>
#include <unistd.h>     // sysconf()
#ifdef linux
#define IPV6_VERSION    0x60
#define IPV6_DEFHLIM    64
#endif

#include "ctrs.h"

static void usage(int);

#ifdef _WIN32
#define cpuset_t        DWORD_PTR   //uint64_t
static inline void CPU_ZERO(cpuset_t *p)
{
        *p = 0;
}

static inline void CPU_SET(uint32_t i, cpuset_t *p)
{
        *p |= 1<< (i & 0x3f);
}

#define pthread_setaffinity_np(a, b, c) !SetThreadAffinityMask(a, *c)    //((void)a, 0)
#define TAP_CLONEDEV    "/dev/tap"
#define AF_LINK 18      //defined in winsocks.h
#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
#include <net/if_dl.h>

/*
 * Convert an ASCII representation of an ethernet address to
 * binary form.
 */
struct ether_addr *
ether_aton(const char *a)
{
        int i;
        static struct ether_addr o;
        unsigned int o0, o1, o2, o3, o4, o5;

        i = sscanf(a, "%x:%x:%x:%x:%x:%x", &o0, &o1, &o2, &o3, &o4, &o5);

        if (i != 6)
                return (NULL);

        o.octet[0]=o0;
        o.octet[1]=o1;
        o.octet[2]=o2;
        o.octet[3]=o3;
        o.octet[4]=o4;
        o.octet[5]=o5;

        return ((struct ether_addr *)&o);
}

/*
 * Convert a binary representation of an ethernet address to
 * an ASCII string.
 */
char *
ether_ntoa(const struct ether_addr *n)
{
        int i;
        static char a[18];

        i = sprintf(a, "%02x:%02x:%02x:%02x:%02x:%02x",
            n->octet[0], n->octet[1], n->octet[2],
            n->octet[3], n->octet[4], n->octet[5]);
        return (i < 17 ? NULL : (char *)&a);
}
#endif /* _WIN32 */

#ifdef linux

#define cpuset_t        cpu_set_t

#define ifr_flagshigh  ifr_flags        /* only the low 16 bits here */
#define IFF_PPROMISC   IFF_PROMISC      /* IFF_PPROMISC does not exist */
#include <linux/ethtool.h>
#include <linux/sockios.h>

#define CLOCK_REALTIME_PRECISE CLOCK_REALTIME
#include <netinet/ether.h>      /* ether_aton */
#include <linux/if_packet.h>    /* sockaddr_ll */
#endif  /* linux */

#ifdef __FreeBSD__
#include <sys/endian.h> /* le64toh */
#include <machine/param.h>

#include <pthread_np.h> /* pthread w/ affinity */
#include <sys/cpuset.h> /* cpu_set */
#include <net/if_dl.h>  /* LLADDR */
#endif  /* __FreeBSD__ */

#ifdef __APPLE__

#define cpuset_t        uint64_t        // XXX
static inline void CPU_ZERO(cpuset_t *p)
{
        *p = 0;
}

static inline void CPU_SET(uint32_t i, cpuset_t *p)
{
        *p |= 1<< (i & 0x3f);
}

#define pthread_setaffinity_np(a, b, c) ((void)a, 0)

#define ifr_flagshigh  ifr_flags        // XXX
#define IFF_PPROMISC   IFF_PROMISC
#include <net/if_dl.h>  /* LLADDR */
#define clock_gettime(a,b)      \
        do {struct timespec t0 = {0,0}; *(b) = t0; } while (0)
#endif  /* __APPLE__ */

static const char *default_payload = "netmap pkt-gen DIRECT payload\n"
        "http://info.iet.unipi.it/~luigi/netmap/ ";

static const char *indirect_payload = "netmap pkt-gen indirect payload\n"
        "http://info.iet.unipi.it/~luigi/netmap/ ";

static int verbose = 0;
static int normalize = 1;

#define VIRT_HDR_1      10      /* length of a base vnet-hdr */
#define VIRT_HDR_2      12      /* length of the extenede vnet-hdr */
#define VIRT_HDR_MAX    VIRT_HDR_2
struct virt_header {
        uint8_t fields[VIRT_HDR_MAX];
};

#define MAX_BODYSIZE    65536

struct pkt {
        struct virt_header vh;
        struct ether_header eh;
        union {
                struct {
                        struct ip ip;
                        struct udphdr udp;
                        uint8_t body[MAX_BODYSIZE];     /* hardwired */
                } ipv4;
                struct {
                        struct ip6_hdr ip;
                        struct udphdr udp;
                        uint8_t body[MAX_BODYSIZE];     /* hardwired */
                } ipv6;
        };
} __attribute__((__packed__));

#define PKT(p, f, af)   \
    ((af) == AF_INET ? (p)->ipv4.f: (p)->ipv6.f)

struct ip_range {
        const char *name;
        union {
                struct {
                        uint32_t start, end; /* same as struct in_addr */
                } ipv4;
                struct {
                        struct in6_addr start, end;
                        uint8_t sgroup, egroup;
                } ipv6;
        };
        uint16_t port0, port1;
};

struct mac_range {
        const char *name;
        struct ether_addr start, end;
};

/* ifname can be netmap:foo-xxxx */
#define MAX_IFNAMELEN   512     /* our buffer for ifname */
//#define MAX_PKTSIZE   1536
#define MAX_PKTSIZE     MAX_BODYSIZE    /* XXX: + IP_HDR + ETH_HDR */

/* compact timestamp to fit into 60 byte packet. (enough to obtain RTT) */
struct tstamp {
        uint32_t sec;
        uint32_t nsec;
};

/*
 * global arguments for all threads
 */

struct glob_arg {
        int af;         /* address family AF_INET/AF_INET6 */
        struct ip_range src_ip;
        struct ip_range dst_ip;
        struct mac_range dst_mac;
        struct mac_range src_mac;
        int pkt_size;
        int pkt_min_size;
        int burst;
        int forever;
        uint64_t npackets;      /* total packets to send */
        int frags;              /* fragments per packet */
        u_int frag_size;        /* size of each fragment */
        int nthreads;
        int cpus;       /* cpus used for running */
        int system_cpus;        /* cpus on the system */

        int options;    /* testing */
#define OPT_PREFETCH    1
#define OPT_ACCESS      2
#define OPT_COPY        4
#define OPT_MEMCPY      8
#define OPT_TS          16      /* add a timestamp */
#define OPT_INDIRECT    32      /* use indirect buffers, tx only */
#define OPT_DUMP        64      /* dump rx/tx traffic */
#define OPT_RUBBISH     256     /* send whatever the buffers contain */
#define OPT_RANDOM_SRC  512
#define OPT_RANDOM_DST  1024
#define OPT_PPS_STATS   2048
#define OPT_UPDATE_CSUM 4096
        int dev_type;
#ifndef NO_PCAP
        pcap_t *p;
#endif

        int tx_rate;
        struct timespec tx_period;

        int affinity;
        int main_fd;
        struct nmport_d *nmd;
        uint32_t orig_mode;
        int report_interval;            /* milliseconds between prints */
        void *(*td_body)(void *);
        int td_type;
        void *mmap_addr;
        char ifname[MAX_IFNAMELEN];
        const char *nmr_config;
        int dummy_send;
        int virt_header;        /* send also the virt_header */
        char *packet_file;      /* -P option */
#define STATS_WIN       15
        int win_idx;
        int64_t win[STATS_WIN];
        int wait_link;
        int framing;            /* #bits of framing (for bw output) */
};
enum dev_type { DEV_NONE, DEV_NETMAP, DEV_PCAP, DEV_TAP };

enum {
        TD_TYPE_SENDER = 1,
        TD_TYPE_RECEIVER,
        TD_TYPE_OTHER,
};

/*
 * Arguments for a new thread. The same structure is used by
 * the source and the sink
 */
struct targ {
        struct glob_arg *g;
        int used;
        int completed;
        int cancel;
        int fd;
        struct nmport_d *nmd;
        /* these ought to be volatile, but they are
         * only sampled and errors should not accumulate
         */
        struct my_ctrs ctr;

        struct timespec tic, toc;
        int me;
        pthread_t thread;
        int affinity;

        struct pkt pkt;
        void *frame;
        uint16_t seed[3];
        u_int frags;
        u_int frag_size;
};

static __inline uint16_t
cksum_add(uint16_t sum, uint16_t a)
{
        uint16_t res;

        res = sum + a;
        return (res + (res < a));
}

static void
extract_ipv4_addr(char *name, uint32_t *addr, uint16_t *port)
{
        struct in_addr a;
        char *pp;

        pp = strchr(name, ':');
        if (pp != NULL) {       /* do we have ports ? */
                *pp++ = '\0';
                *port = (uint16_t)strtol(pp, NULL, 0);
        }

        inet_pton(AF_INET, name, &a);
        *addr = ntohl(a.s_addr);
}

static void
extract_ipv6_addr(char *name, struct in6_addr *addr, uint16_t *port,
    uint8_t *group)
{
        char *pp;

        /*
         * We accept IPv6 address in the following form:
         *  group@[2001:DB8::1001]:port (w/ brackets and port)
         *  group@[2001:DB8::1]         (w/ brackets and w/o port)
         *  group@2001:DB8::1234        (w/o brackets and w/o port)
         */
        pp = strchr(name, '@');
        if (pp != NULL) {
                *pp++ = '\0';
                *group = (uint8_t)strtol(name, NULL, 0);
                if (*group > 7)
                        *group = 7;
                name = pp;
        }
        if (name[0] == '[')
                name++;
        pp = strchr(name, ']');
        if (pp != NULL)
                *pp++ = '\0';
        if (pp != NULL && *pp != ':')
                pp = NULL;
        if (pp != NULL) {       /* do we have ports ? */
                *pp++ = '\0';
                *port = (uint16_t)strtol(pp, NULL, 0);
        }
        inet_pton(AF_INET6, name, addr);
}
/*
 * extract the extremes from a range of ipv4 addresses.
 * addr_lo[-addr_hi][:port_lo[-port_hi]]
 */
static int
extract_ip_range(struct ip_range *r, int af)
{
        char *name, *ap, start[INET6_ADDRSTRLEN];
        char end[INET6_ADDRSTRLEN];
        struct in_addr a;
        uint32_t tmp;

        if (verbose)
                D("extract IP range from %s", r->name);

        name = strdup(r->name);
        if (name == NULL) {
                D("strdup failed");
                usage(-1);
        }
        /* the first - splits start/end of range */
        ap = strchr(name, '-');
        if (ap != NULL)
                *ap++ = '\0';
        r->port0 = 1234;        /* default port */
        if (af == AF_INET6) {
                r->ipv6.sgroup = 7; /* default group */
                extract_ipv6_addr(name, &r->ipv6.start, &r->port0,
                    &r->ipv6.sgroup);
        } else
                extract_ipv4_addr(name, &r->ipv4.start, &r->port0);

        r->port1 = r->port0;
        if (af == AF_INET6) {
                if (ap != NULL) {
                        r->ipv6.egroup = r->ipv6.sgroup;
                        extract_ipv6_addr(ap, &r->ipv6.end, &r->port1,
                            &r->ipv6.egroup);
                } else {
                        r->ipv6.end = r->ipv6.start;
                        r->ipv6.egroup = r->ipv6.sgroup;
                }
        } else {
                if (ap != NULL) {
                        extract_ipv4_addr(ap, &r->ipv4.end, &r->port1);
                        if (r->ipv4.start > r->ipv4.end) {
                                tmp = r->ipv4.end;
                                r->ipv4.end = r->ipv4.start;
                                r->ipv4.start = tmp;
                        }
                } else
                        r->ipv4.end = r->ipv4.start;
        }

        if (r->port0 > r->port1) {
                tmp = r->port0;
                r->port0 = r->port1;
                r->port1 = tmp;
        }
        if (af == AF_INET) {
                a.s_addr = htonl(r->ipv4.start);
                inet_ntop(af, &a, start, sizeof(start));
                a.s_addr = htonl(r->ipv4.end);
                inet_ntop(af, &a, end, sizeof(end));
        } else {
                inet_ntop(af, &r->ipv6.start, start, sizeof(start));
                inet_ntop(af, &r->ipv6.end, end, sizeof(end));
        }
        if (af == AF_INET)
                D("range is %s:%d to %s:%d", start, r->port0, end, r->port1);
        else
                D("range is %d@[%s]:%d to %d@[%s]:%d", r->ipv6.sgroup,
                    start, r->port0, r->ipv6.egroup, end, r->port1);

        free(name);
        if (r->port0 != r->port1 ||
            (af == AF_INET && r->ipv4.start != r->ipv4.end) ||
            (af == AF_INET6 &&
                !IN6_ARE_ADDR_EQUAL(&r->ipv6.start, &r->ipv6.end)))
                return (OPT_COPY);
        return (0);
}

static int
extract_mac_range(struct mac_range *r)
{
        struct ether_addr *e;
        if (verbose)
            D("extract MAC range from %s", r->name);

        e = ether_aton(r->name);
        if (e == NULL) {
                D("invalid MAC address '%s'", r->name);
                return 1;
        }
        bcopy(e, &r->start, 6);
        bcopy(e, &r->end, 6);
#if 0
        bcopy(targ->src_mac, eh->ether_shost, 6);
        p = index(targ->g->src_mac, '-');
        if (p)
                targ->src_mac_range = atoi(p+1);

        bcopy(ether_aton(targ->g->dst_mac), targ->dst_mac, 6);
        bcopy(targ->dst_mac, eh->ether_dhost, 6);
        p = index(targ->g->dst_mac, '-');
        if (p)
                targ->dst_mac_range = atoi(p+1);
#endif
        if (verbose)
                D("%s starts at %s", r->name, ether_ntoa(&r->start));
        return 0;
}

static int
get_if_mtu(const struct glob_arg *g)
{
        struct ifreq ifreq;
        int s, ret;
        const char *ifname = g->nmd->hdr.nr_name;
        size_t len;

        if (!strncmp(g->ifname, "netmap:", 7) && !strchr(ifname, '{')
                        && !strchr(ifname, '}')) {

                len = strlen(ifname);

                if (len > IFNAMSIZ) {
                        D("'%s' too long, cannot ask for MTU", ifname);
                        return -1;
                }

                s = socket(AF_INET, SOCK_DGRAM, 0);
                if (s < 0) {
                        D("socket() failed: %s", strerror(errno));
                        return s;
                }

                memset(&ifreq, 0, sizeof(ifreq));
                memcpy(ifreq.ifr_name, ifname, len);

                ret = ioctl(s, SIOCGIFMTU, &ifreq);
                if (ret) {
                        D("ioctl(SIOCGIFMTU) failed: %s", strerror(errno));
                }

                close(s);

                return ifreq.ifr_mtu;
        }

        /* This is a pipe or a VALE port, where the MTU is very large,
         * so we use some practical limit. */
        return 65536;
}

static struct targ *targs;
static int global_nthreads;

/* control-C handler */
static void
sigint_h(int sig)
{
        int i;

        (void)sig;      /* UNUSED */
        D("received control-C on thread %p", (void *)pthread_self());
        for (i = 0; i < global_nthreads; i++) {
                targs[i].cancel = 1;
        }
}

/* sysctl wrapper to return the number of active CPUs */
static int
system_ncpus(void)
{
        int ncpus;
#if defined (__FreeBSD__)
        int mib[2] = { CTL_HW, HW_NCPU };
        size_t len = sizeof(mib);
        sysctl(mib, 2, &ncpus, &len, NULL, 0);
#elif defined(linux)
        ncpus = sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(_WIN32)
        {
                SYSTEM_INFO sysinfo;
                GetSystemInfo(&sysinfo);
                ncpus = sysinfo.dwNumberOfProcessors;
        }
#else /* others */
        ncpus = 1;
#endif /* others */
        return (ncpus);
}

#ifdef __linux__
#define sockaddr_dl    sockaddr_ll
#define sdl_family     sll_family
#define AF_LINK        AF_PACKET
#define LLADDR(s)      s->sll_addr;
#include <linux/if_tun.h>
#define TAP_CLONEDEV    "/dev/net/tun"
#endif /* __linux__ */

#ifdef __FreeBSD__
#include <net/if_tun.h>
#define TAP_CLONEDEV    "/dev/tap"
#endif /* __FreeBSD */

#ifdef __APPLE__
// #warning TAP not supported on apple ?
#include <net/if_utun.h>
#define TAP_CLONEDEV    "/dev/tap"
#endif /* __APPLE__ */


/*
 * parse the vale configuration in conf and put it in nmr.
 * Return the flag set if necessary.
 * The configuration may consist of 1 to 4 numbers separated
 * by commas: #tx-slots,#rx-slots,#tx-rings,#rx-rings.
 * Missing numbers or zeroes stand for default values.
 * As an additional convenience, if exactly one number
 * is specified, then this is assigned to both #tx-slots and #rx-slots.
 * If there is no 4th number, then the 3rd is assigned to both #tx-rings
 * and #rx-rings.
 */
static int
parse_nmr_config(const char* conf, struct nmreq_register *nmr)
{
        char *w, *tok;
        int i, v;

        if (conf == NULL || ! *conf)
                return 0;
        nmr->nr_tx_rings = nmr->nr_rx_rings = 0;
        nmr->nr_tx_slots = nmr->nr_rx_slots = 0;
        w = strdup(conf);
        for (i = 0, tok = strtok(w, ","); tok; i++, tok = strtok(NULL, ",")) {
                v = atoi(tok);
                switch (i) {
                case 0:
                        nmr->nr_tx_slots = nmr->nr_rx_slots = v;
                        break;
                case 1:
                        nmr->nr_rx_slots = v;
                        break;
                case 2:
                        nmr->nr_tx_rings = nmr->nr_rx_rings = v;
                        break;
                case 3:
                        nmr->nr_rx_rings = v;
                        break;
                default:
                        D("ignored config: %s", tok);
                        break;
                }
        }
        D("txr %d txd %d rxr %d rxd %d",
                        nmr->nr_tx_rings, nmr->nr_tx_slots,
                        nmr->nr_rx_rings, nmr->nr_rx_slots);
        free(w);
        return 0;
}


/*
 * locate the src mac address for our interface, put it
 * into the user-supplied buffer. return 0 if ok, -1 on error.
 */
static int
source_hwaddr(const char *ifname, char *buf)
{
        struct ifaddrs *ifaphead, *ifap;

        if (getifaddrs(&ifaphead) != 0) {
                D("getifaddrs %s failed", ifname);
                return (-1);
        }

        /* remove 'netmap:' prefix before comparing interfaces */
        if (!strncmp(ifname, "netmap:", 7))
                ifname = &ifname[7];

        for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) {
                struct sockaddr_dl *sdl =
                        (struct sockaddr_dl *)ifap->ifa_addr;
                uint8_t *mac;

                if (!sdl || sdl->sdl_family != AF_LINK)
                        continue;
                if (strncmp(ifap->ifa_name, ifname, IFNAMSIZ) != 0)
                        continue;
                mac = (uint8_t *)LLADDR(sdl);
                sprintf(buf, "%02x:%02x:%02x:%02x:%02x:%02x",
                        mac[0], mac[1], mac[2],
                        mac[3], mac[4], mac[5]);
                if (verbose)
                        D("source hwaddr %s", buf);
                break;
        }
        freeifaddrs(ifaphead);
        return ifap ? 0 : 1;
}


/* set the thread affinity. */
static int
setaffinity(pthread_t me, int i)
{
        cpuset_t cpumask;

        if (i == -1)
                return 0;

        /* Set thread affinity affinity.*/
        CPU_ZERO(&cpumask);
        CPU_SET(i, &cpumask);

        if (pthread_setaffinity_np(me, sizeof(cpuset_t), &cpumask) != 0) {
                D("Unable to set affinity: %s", strerror(errno));
                return 1;
        }
        return 0;
}


/* Compute the checksum of the given ip header. */
static uint32_t
checksum(const void *data, uint16_t len, uint32_t sum)
{
        const uint8_t *addr = data;
        uint32_t i;

        /* Checksum all the pairs of bytes first... */
        for (i = 0; i < (len & ~1U); i += 2) {
                sum += (uint16_t)ntohs(*((const uint16_t *)(addr + i)));
                if (sum > 0xFFFF)
                        sum -= 0xFFFF;
        }
        /*
         * If there's a single byte left over, checksum it, too.
         * Network byte order is big-endian, so the remaining byte is
         * the high byte.
         */
        if (i < len) {
                sum += addr[i] << 8;
                if (sum > 0xFFFF)
                        sum -= 0xFFFF;
        }
        return sum;
}

static uint16_t
wrapsum(uint32_t sum)
{
        sum = ~sum & 0xFFFF;
        return (htons(sum));
}

/* Check the payload of the packet for errors (use it for debug).
 * Look for consecutive ascii representations of the size of the packet.
 */
static void
dump_payload(const char *_p, int len, struct netmap_ring *ring, int cur)
{
        char buf[128];
        int i, j, i0;
        const unsigned char *p = (const unsigned char *)_p;

        /* get the length in ASCII of the length of the packet. */

        printf("ring %p cur %5d [buf %6d flags 0x%04x len %5d]\n",
                ring, cur, ring->slot[cur].buf_idx,
                ring->slot[cur].flags, len);
        /* hexdump routine */
        for (i = 0; i < len; ) {
                memset(buf, ' ', sizeof(buf));
                sprintf(buf, "%5d: ", i);
                i0 = i;
                for (j=0; j < 16 && i < len; i++, j++)
                        sprintf(buf+7+j*3, "%02x ", (uint8_t)(p[i]));
                i = i0;
                for (j=0; j < 16 && i < len; i++, j++)
                        sprintf(buf+7+j + 48, "%c",
                                isprint(p[i]) ? p[i] : '.');
                printf("%s\n", buf);
        }
}

/*
 * Fill a packet with some payload.
 * We create a UDP packet so the payload starts at
 *      14+20+8 = 42 bytes.
 */
#ifdef __linux__
#define uh_sport source
#define uh_dport dest
#define uh_ulen len
#define uh_sum check
#endif /* linux */

static uint16_t
new_ip_sum(uint16_t ip_sum, uint32_t oaddr, uint32_t naddr)
{
        ip_sum = cksum_add(ip_sum, ~oaddr >> 16);
        ip_sum = cksum_add(ip_sum, ~oaddr & 0xffff);
        ip_sum = cksum_add(ip_sum, naddr >> 16);
        ip_sum = cksum_add(ip_sum, naddr & 0xffff);
        return ip_sum;
}

static uint16_t
new_udp_sum(uint16_t udp_sum, uint16_t oport, uint16_t nport)
{
        udp_sum = cksum_add(udp_sum, ~oport);
        udp_sum = cksum_add(udp_sum, nport);
        return udp_sum;
}


static void
update_ip(struct pkt *pkt, struct targ *t)
{
        struct glob_arg *g = t->g;
        struct ip ip;
        struct udphdr udp;
        uint32_t oaddr, naddr;
        uint16_t oport, nport;
        uint16_t ip_sum = 0, udp_sum = 0;

        memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
        memcpy(&udp, &pkt->ipv4.udp, sizeof(udp));
        do {
                ip_sum = udp_sum = 0;
                naddr = oaddr = ntohl(ip.ip_src.s_addr);
                nport = oport = ntohs(udp.uh_sport);
                if (g->options & OPT_RANDOM_SRC) {
                        ip.ip_src.s_addr = nrand48(t->seed);
                        udp.uh_sport = nrand48(t->seed);
                        naddr = ntohl(ip.ip_src.s_addr);
                        nport = ntohs(udp.uh_sport);
                        ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
                        udp_sum = new_udp_sum(udp_sum, oport, nport);
                } else {
                        if (oport < g->src_ip.port1) {
                                nport = oport + 1;
                                udp.uh_sport = htons(nport);
                                udp_sum = new_udp_sum(udp_sum, oport, nport);
                                break;
                        }
                        nport = g->src_ip.port0;
                        udp.uh_sport = htons(nport);
                        if (oaddr < g->src_ip.ipv4.end) {
                                naddr = oaddr + 1;
                                ip.ip_src.s_addr = htonl(naddr);
                                ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
                                break;
                        }
                        naddr = g->src_ip.ipv4.start;
                        ip.ip_src.s_addr = htonl(naddr);
                        ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
                }

                naddr = oaddr = ntohl(ip.ip_dst.s_addr);
                nport = oport = ntohs(udp.uh_dport);
                if (g->options & OPT_RANDOM_DST) {
                        ip.ip_dst.s_addr = nrand48(t->seed);
                        udp.uh_dport = nrand48(t->seed);
                        naddr = ntohl(ip.ip_dst.s_addr);
                        nport = ntohs(udp.uh_dport);
                        ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
                        udp_sum = new_udp_sum(udp_sum, oport, nport);
                } else {
                        if (oport < g->dst_ip.port1) {
                                nport = oport + 1;
                                udp.uh_dport = htons(nport);
                                udp_sum = new_udp_sum(udp_sum, oport, nport);
                                break;
                        }
                        nport = g->dst_ip.port0;
                        udp.uh_dport = htons(nport);
                        if (oaddr < g->dst_ip.ipv4.end) {
                                naddr = oaddr + 1;
                                ip.ip_dst.s_addr = htonl(naddr);
                                ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
                                break;
                        }
                        naddr = g->dst_ip.ipv4.start;
                        ip.ip_dst.s_addr = htonl(naddr);
                        ip_sum = new_ip_sum(ip_sum, oaddr, naddr);
                }
        } while (0);
        /* update checksums */
        if (udp_sum != 0)
                udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(udp_sum));
        if (ip_sum != 0) {
                ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum));
                udp.uh_sum = ~cksum_add(~udp.uh_sum, htons(ip_sum));
        }
        memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
        memcpy(&pkt->ipv4.udp, &udp, sizeof(udp));
}

#ifndef s6_addr16
#define s6_addr16       __u6_addr.__u6_addr16
#endif
static void
update_ip6(struct pkt *pkt, struct targ *t)
{
        struct glob_arg *g = t->g;
        struct ip6_hdr ip6;
        struct udphdr udp;
        uint16_t udp_sum;
        uint16_t oaddr, naddr;
        uint16_t oport, nport;
        uint8_t group;

        memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
        memcpy(&udp, &pkt->ipv6.udp, sizeof(udp));
        do {
                udp_sum = 0;
                group = g->src_ip.ipv6.sgroup;
                naddr = oaddr = ntohs(ip6.ip6_src.s6_addr16[group]);
                nport = oport = ntohs(udp.uh_sport);
                if (g->options & OPT_RANDOM_SRC) {
                        ip6.ip6_src.s6_addr16[group] = nrand48(t->seed);
                        udp.uh_sport = nrand48(t->seed);
                        naddr = ntohs(ip6.ip6_src.s6_addr16[group]);
                        nport = ntohs(udp.uh_sport);
                        break;
                }
                if (oport < g->src_ip.port1) {
                        nport = oport + 1;
                        udp.uh_sport = htons(nport);
                        break;
                }
                nport = g->src_ip.port0;
                udp.uh_sport = htons(nport);
                if (oaddr < ntohs(g->src_ip.ipv6.end.s6_addr16[group])) {
                        naddr = oaddr + 1;
                        ip6.ip6_src.s6_addr16[group] = htons(naddr);
                        break;
                }
                naddr = ntohs(g->src_ip.ipv6.start.s6_addr16[group]);
                ip6.ip6_src.s6_addr16[group] = htons(naddr);

                /* update checksums if needed */
                if (oaddr != naddr)
                        udp_sum = cksum_add(~oaddr, naddr);
                if (oport != nport)
                        udp_sum = cksum_add(udp_sum,
                            cksum_add(~oport, nport));

                group = g->dst_ip.ipv6.egroup;
                naddr = oaddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
                nport = oport = ntohs(udp.uh_dport);
                if (g->options & OPT_RANDOM_DST) {
                        ip6.ip6_dst.s6_addr16[group] = nrand48(t->seed);
                        udp.uh_dport = nrand48(t->seed);
                        naddr = ntohs(ip6.ip6_dst.s6_addr16[group]);
                        nport = ntohs(udp.uh_dport);
                        break;
                }
                if (oport < g->dst_ip.port1) {
                        nport = oport + 1;
                        udp.uh_dport = htons(nport);
                        break;
                }
                nport = g->dst_ip.port0;
                udp.uh_dport = htons(nport);
                if (oaddr < ntohs(g->dst_ip.ipv6.end.s6_addr16[group])) {
                        naddr = oaddr + 1;
                        ip6.ip6_dst.s6_addr16[group] = htons(naddr);
                        break;
                }
                naddr = ntohs(g->dst_ip.ipv6.start.s6_addr16[group]);
                ip6.ip6_dst.s6_addr16[group] = htons(naddr);
        } while (0);
        /* update checksums */
        if (oaddr != naddr)
                udp_sum = cksum_add(udp_sum,
                    cksum_add(~oaddr, naddr));
        if (oport != nport)
                udp_sum = cksum_add(udp_sum,
                    cksum_add(~oport, nport));
        if (udp_sum != 0)
                udp.uh_sum = ~cksum_add(~udp.uh_sum, udp_sum);
        memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
        memcpy(&pkt->ipv6.udp, &udp, sizeof(udp));
}

static void
update_addresses(struct pkt *pkt, struct targ *t)
{

        if (t->g->af == AF_INET)
                update_ip(pkt, t);
        else
                update_ip6(pkt, t);
}

static void
update_ip_size(struct pkt *pkt, int size)
{
        struct ip ip;
        struct udphdr udp;
        uint16_t oiplen, niplen;
        uint16_t nudplen;
        uint16_t ip_sum = 0;

        memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
        memcpy(&udp, &pkt->ipv4.udp, sizeof(udp));

        oiplen = ntohs(ip.ip_len);
        niplen = size - sizeof(struct ether_header);
        ip.ip_len = htons(niplen);
        nudplen = niplen - sizeof(struct ip);
        udp.uh_ulen = htons(nudplen);
        ip_sum = new_udp_sum(ip_sum, oiplen, niplen);

        /* update checksums */
        if (ip_sum != 0)
                ip.ip_sum = ~cksum_add(~ip.ip_sum, htons(ip_sum));

        udp.uh_sum = 0;
        /* Magic: taken from sbin/dhclient/packet.c */
        udp.uh_sum = wrapsum(
                checksum(&udp, sizeof(udp),     /* udp header */
                checksum(pkt->ipv4.body,        /* udp payload */
                nudplen - sizeof(udp),
                checksum(&ip.ip_src, /* pseudo header */
                2 * sizeof(ip.ip_src),
                IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen)))));

        memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
        memcpy(&pkt->ipv4.udp, &udp, sizeof(udp));
}

static void
update_ip6_size(struct pkt *pkt, int size)
{
        struct ip6_hdr ip6;
        struct udphdr udp;
        uint16_t niplen, nudplen;
        uint32_t csum;

        memcpy(&ip6, &pkt->ipv6.ip, sizeof(ip6));
        memcpy(&udp, &pkt->ipv6.udp, sizeof(udp));

        nudplen = niplen = size - sizeof(struct ether_header) - sizeof(ip6);
        ip6.ip6_plen = htons(niplen);
        udp.uh_ulen = htons(nudplen);

        /* Save part of pseudo header checksum into csum */
        udp.uh_sum = 0;
        csum = IPPROTO_UDP << 24;
        csum = checksum(&csum, sizeof(csum), nudplen);
        udp.uh_sum = wrapsum(
                checksum(&udp, sizeof(udp),     /* udp header */
                checksum(pkt->ipv6.body,        /* udp payload */
                nudplen - sizeof(udp),
                checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */
                2 * sizeof(pkt->ipv6.ip.ip6_src), csum))));

        memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
        memcpy(&pkt->ipv6.udp, &udp, sizeof(udp));
}

static void
update_size(struct pkt *pkt, struct targ *t, int size)
{
        if (t->g->options & OPT_UPDATE_CSUM) {
                if (t->g->af == AF_INET)
                        update_ip_size(pkt, size);
                else
                        update_ip6_size(pkt, size);
        }
}

/*
 * initialize one packet and prepare for the next one.
 * The copy could be done better instead of repeating it each time.
 */
static void
initialize_packet(struct targ *targ)
{
        struct pkt *pkt = &targ->pkt;
        struct ether_header *eh;
        struct ip6_hdr ip6;
        struct ip ip;
        struct udphdr udp;
        void *udp_ptr;
        uint16_t paylen;
        uint32_t csum = 0;
        const char *payload = targ->g->options & OPT_INDIRECT ?
                indirect_payload : default_payload;
        int i, l0 = strlen(payload);

#ifndef NO_PCAP
        char errbuf[PCAP_ERRBUF_SIZE];
        pcap_t *file;
        struct pcap_pkthdr *header;
        const unsigned char *packet;

        /* Read a packet from a PCAP file if asked. */
        if (targ->g->packet_file != NULL) {
                if ((file = pcap_open_offline(targ->g->packet_file,
                            errbuf)) == NULL)
                        D("failed to open pcap file %s",
                            targ->g->packet_file);
                if (pcap_next_ex(file, &header, &packet) < 0)
                        D("failed to read packet from %s",
                            targ->g->packet_file);
                if ((targ->frame = malloc(header->caplen)) == NULL)
                        D("out of memory");
                bcopy(packet, (unsigned char *)targ->frame, header->caplen);
                targ->g->pkt_size = header->caplen;
                pcap_close(file);
                return;
        }
#endif

        paylen = targ->g->pkt_size - sizeof(*eh) -
            (targ->g->af == AF_INET ? sizeof(ip): sizeof(ip6));

        /* create a nice NUL-terminated string */
        for (i = 0; i < paylen; i += l0) {
                if (l0 > paylen - i)
                        l0 = paylen - i; // last round
                bcopy(payload, PKT(pkt, body, targ->g->af) + i, l0);
        }
        PKT(pkt, body, targ->g->af)[i - 1] = '\0';

        /* prepare the headers */
        eh = &pkt->eh;
        bcopy(&targ->g->src_mac.start, eh->ether_shost, 6);
        bcopy(&targ->g->dst_mac.start, eh->ether_dhost, 6);

        if (targ->g->af == AF_INET) {
                eh->ether_type = htons(ETHERTYPE_IP);
                memcpy(&ip, &pkt->ipv4.ip, sizeof(ip));
                udp_ptr = &pkt->ipv4.udp;
                ip.ip_v = IPVERSION;
                ip.ip_hl = sizeof(ip) >> 2;
                ip.ip_id = 0;
                ip.ip_tos = IPTOS_LOWDELAY;
                ip.ip_len = htons(targ->g->pkt_size - sizeof(*eh));
                ip.ip_id = 0;
                ip.ip_off = htons(IP_DF); /* Don't fragment */
                ip.ip_ttl = IPDEFTTL;
                ip.ip_p = IPPROTO_UDP;
                ip.ip_dst.s_addr = htonl(targ->g->dst_ip.ipv4.start);
                ip.ip_src.s_addr = htonl(targ->g->src_ip.ipv4.start);
                ip.ip_sum = wrapsum(checksum(&ip, sizeof(ip), 0));
                memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
        } else {
                eh->ether_type = htons(ETHERTYPE_IPV6);
                memcpy(&ip6, &pkt->ipv4.ip, sizeof(ip6));
                udp_ptr = &pkt->ipv6.udp;
                ip6.ip6_flow = 0;
                ip6.ip6_plen = htons(paylen);
                ip6.ip6_vfc = IPV6_VERSION;
                ip6.ip6_nxt = IPPROTO_UDP;
                ip6.ip6_hlim = IPV6_DEFHLIM;
                ip6.ip6_src = targ->g->src_ip.ipv6.start;
                ip6.ip6_dst = targ->g->dst_ip.ipv6.start;
        }
        memcpy(&udp, udp_ptr, sizeof(udp));

        udp.uh_sport = htons(targ->g->src_ip.port0);
        udp.uh_dport = htons(targ->g->dst_ip.port0);
        udp.uh_ulen = htons(paylen);
        if (targ->g->af == AF_INET) {
                /* Magic: taken from sbin/dhclient/packet.c */
                udp.uh_sum = wrapsum(
                    checksum(&udp, sizeof(udp), /* udp header */
                    checksum(pkt->ipv4.body,    /* udp payload */
                    paylen - sizeof(udp),
                    checksum(&pkt->ipv4.ip.ip_src, /* pseudo header */
                        2 * sizeof(pkt->ipv4.ip.ip_src),
                        IPPROTO_UDP + (u_int32_t)ntohs(udp.uh_ulen)))));
                memcpy(&pkt->ipv4.ip, &ip, sizeof(ip));
        } else {
                /* Save part of pseudo header checksum into csum */
                csum = IPPROTO_UDP << 24;
                csum = checksum(&csum, sizeof(csum), paylen);
                udp.uh_sum = wrapsum(
                    checksum(udp_ptr, sizeof(udp),      /* udp header */
                    checksum(pkt->ipv6.body,    /* udp payload */
                    paylen - sizeof(udp),
                    checksum(&pkt->ipv6.ip.ip6_src, /* pseudo header */
                        2 * sizeof(pkt->ipv6.ip.ip6_src), csum))));
                memcpy(&pkt->ipv6.ip, &ip6, sizeof(ip6));
        }
        memcpy(udp_ptr, &udp, sizeof(udp));

        bzero(&pkt->vh, sizeof(pkt->vh));
        // dump_payload((void *)pkt, targ->g->pkt_size, NULL, 0);
}

static void
get_vnet_hdr_len(struct glob_arg *g)
{
        struct nmreq_header hdr;
        struct nmreq_port_hdr ph;
        int err;

        hdr = g->nmd->hdr; /* copy name and version */
        hdr.nr_reqtype = NETMAP_REQ_PORT_HDR_GET;
        hdr.nr_options = 0;
        memset(&ph, 0, sizeof(ph));
        hdr.nr_body = (uintptr_t)&ph;
        err = ioctl(g->main_fd, NIOCCTRL, &hdr);
        if (err) {
                D("Unable to get virtio-net header length");
                return;
        }

        g->virt_header = ph.nr_hdr_len;
        if (g->virt_header) {
                D("Port requires virtio-net header, length = %d",
                  g->virt_header);
        }
}

static void
set_vnet_hdr_len(struct glob_arg *g)
{
        int err, l = g->virt_header;
        struct nmreq_header hdr;
        struct nmreq_port_hdr ph;

        if (l == 0)
                return;

        hdr = g->nmd->hdr; /* copy name and version */
        hdr.nr_reqtype = NETMAP_REQ_PORT_HDR_SET;
        hdr.nr_options = 0;
        memset(&ph, 0, sizeof(ph));
        hdr.nr_body = (uintptr_t)&ph;
        err = ioctl(g->main_fd, NIOCCTRL, &hdr);
        if (err) {
                D("Unable to set virtio-net header length %d", l);
        }
}

/*
 * create and enqueue a batch of packets on a ring.
 * On the last one set NS_REPORT to tell the driver to generate
 * an interrupt when done.
 */
static int
send_packets(struct netmap_ring *ring, struct pkt *pkt, void *frame,
                int size, struct targ *t, u_int count, int options)
{
        u_int n, sent, head = ring->head;
        u_int frags = t->frags;
        u_int frag_size = t->frag_size;
        struct netmap_slot *slot = &ring->slot[head];

        n = nm_ring_space(ring);
#if 0
        if (options & (OPT_COPY | OPT_PREFETCH) ) {
                for (sent = 0; sent < count; sent++) {
                        struct netmap_slot *slot = &ring->slot[head];
                        char *p = NETMAP_BUF(ring, slot->buf_idx);

                        __builtin_prefetch(p);
                        head = nm_ring_next(ring, head);
                }
                head = ring->head;
        }
#endif
        for (sent = 0; sent < count && n >= frags; sent++, n--) {
                char *p;
                int buf_changed;
                u_int tosend = size;

                slot = &ring->slot[head];
                p = NETMAP_BUF(ring, slot->buf_idx);
                buf_changed = slot->flags & NS_BUF_CHANGED;

                slot->flags = 0;
                if (options & OPT_RUBBISH) {
                        /* do nothing */
                } else if (options & OPT_INDIRECT) {
                        slot->flags |= NS_INDIRECT;
                        slot->ptr = (uint64_t)((uintptr_t)frame);
                } else if (frags > 1) {
                        u_int i;
                        const char *f = frame;
                        char *fp = p;
                        for (i = 0; i < frags - 1; i++) {
                                memcpy(fp, f, frag_size);
                                slot->len = frag_size;
                                slot->flags = NS_MOREFRAG;
                                if (options & OPT_DUMP)
                                        dump_payload(fp, frag_size, ring, head);
                                tosend -= frag_size;
                                f += frag_size;
                                head = nm_ring_next(ring, head);
                                slot = &ring->slot[head];
                                fp = NETMAP_BUF(ring, slot->buf_idx);
                        }
                        n -= (frags - 1);
                        p = fp;
                        slot->flags = 0;
                        memcpy(p, f, tosend);
                        update_addresses(pkt, t);
                } else if ((options & (OPT_COPY | OPT_MEMCPY)) || buf_changed) {
                        if (options & OPT_COPY)
                                nm_pkt_copy(frame, p, size);
                        else
                                memcpy(p, frame, size);
                        update_addresses(pkt, t);
                } else if (options & OPT_PREFETCH) {
                        __builtin_prefetch(p);
                }
                slot->len = tosend;
                if (options & OPT_DUMP)
                        dump_payload(p, tosend, ring, head);
                head = nm_ring_next(ring, head);
        }
        if (sent) {
                slot->flags |= NS_REPORT;
                ring->head = ring->cur = head;
        }
        if (sent < count) {
                /* tell netmap that we need more slots */
                ring->cur = ring->tail;
        }

        return (sent);
}

/*
 * Index of the highest bit set
 */
static uint32_t
msb64(uint64_t x)
{
        uint64_t m = 1ULL << 63;
        int i;

        for (i = 63; i >= 0; i--, m >>=1)
                if (m & x)
                        return i;
        return 0;
}

/*
 * wait until ts, either busy or sleeping if more than 1ms.
 * Return wakeup time.
 */
static struct timespec
wait_time(struct timespec ts)
{
        for (;;) {
                struct timespec w, cur;
                clock_gettime(CLOCK_REALTIME_PRECISE, &cur);
                w = timespec_sub(ts, cur);
                if (w.tv_sec < 0)
                        return cur;
                else if (w.tv_sec > 0 || w.tv_nsec > 1000000)
                        poll(NULL, 0, 1);
        }
}

/*
 * Send a packet, and wait for a response.
 * The payload (after UDP header, ofs 42) has a 4-byte sequence
 * followed by a struct timeval (or bintime?)
 */

static void *
ping_body(void *data)
{
        struct targ *targ = (struct targ *) data;
        struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
        struct netmap_if *nifp = targ->nmd->nifp;
        int i, m;
        void *frame;
        int size;
        struct timespec ts, now, last_print;
        struct timespec nexttime = {0, 0}; /* silence compiler */
        uint64_t sent = 0, n = targ->g->npackets;
        uint64_t count = 0, t_cur, t_min = ~0, av = 0;
        uint64_t g_min = ~0, g_av = 0;
        uint64_t buckets[64];   /* bins for delays, ns */
        int rate_limit = targ->g->tx_rate, tosend = 0;

        frame = (char*)&targ->pkt + sizeof(targ->pkt.vh) - targ->g->virt_header;
        size = targ->g->pkt_size + targ->g->virt_header;


        if (targ->g->nthreads > 1) {
                D("can only ping with 1 thread");
                return NULL;
        }

        if (targ->g->af == AF_INET6) {
                D("Warning: ping-pong with IPv6 not supported");
        }

        bzero(&buckets, sizeof(buckets));
        clock_gettime(CLOCK_REALTIME_PRECISE, &last_print);
        now = last_print;
        if (rate_limit) {
                targ->tic = timespec_add(now, (struct timespec){2,0});
                targ->tic.tv_nsec = 0;
                wait_time(targ->tic);
                nexttime = targ->tic;
        }
        while (!targ->cancel && (n == 0 || sent < n)) {
                struct netmap_ring *ring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
                struct netmap_slot *slot;
                char *p;
                int rv;
                uint64_t limit, event = 0;

                if (rate_limit && tosend <= 0) {
                        tosend = targ->g->burst;
                        nexttime = timespec_add(nexttime, targ->g->tx_period);
                        wait_time(nexttime);
                }

                limit = rate_limit ? tosend : targ->g->burst;
                if (n > 0 && n - sent < limit)
                        limit = n - sent;
                for (m = 0; (unsigned)m < limit; m++) {
                        slot = &ring->slot[ring->head];
                        slot->len = size;
                        p = NETMAP_BUF(ring, slot->buf_idx);

                        if (nm_ring_empty(ring)) {
                                D("-- ouch, cannot send");
                                break;
                        } else {
                                struct tstamp *tp;
                                nm_pkt_copy(frame, p, size);
                                clock_gettime(CLOCK_REALTIME_PRECISE, &ts);
                                bcopy(&sent, p+42, sizeof(sent));
                                tp = (struct tstamp *)(p+46);
                                tp->sec = (uint32_t)ts.tv_sec;
                                tp->nsec = (uint32_t)ts.tv_nsec;
                                sent++;
                                ring->head = ring->cur = nm_ring_next(ring, ring->head);
                        }
                }
                if (m > 0)
                        event++;
                targ->ctr.pkts = sent;
                targ->ctr.bytes = sent*size;
                targ->ctr.events = event;
                if (rate_limit)
                        tosend -= m;
#ifdef BUSYWAIT
                rv = ioctl(pfd.fd, NIOCTXSYNC, NULL);
                if (rv < 0) {
                        D("TXSYNC error on queue %d: %s", targ->me,
                                strerror(errno));
                }
        again:
                ioctl(pfd.fd, NIOCRXSYNC, NULL);
#else
                /* should use a parameter to decide how often to send */
                if ( (rv = poll(&pfd, 1, 3000)) <= 0) {
                        D("poll error on queue %d: %s", targ->me,
                                (rv ? strerror(errno) : "timeout"));
                        continue;
                }
#endif /* BUSYWAIT */
                /* see what we got back */
#ifdef BUSYWAIT
                int rx = 0;
#endif
                for (i = targ->nmd->first_rx_ring;
                        i <= targ->nmd->last_rx_ring; i++) {
                        ring = NETMAP_RXRING(nifp, i);
                        while (!nm_ring_empty(ring)) {
                                uint32_t seq;
                                struct tstamp *tp;
                                int pos;

                                slot = &ring->slot[ring->head];
                                p = NETMAP_BUF(ring, slot->buf_idx);

                                clock_gettime(CLOCK_REALTIME_PRECISE, &now);
                                bcopy(p+42, &seq, sizeof(seq));
                                tp = (struct tstamp *)(p+46);
                                ts.tv_sec = (time_t)tp->sec;
                                ts.tv_nsec = (long)tp->nsec;
                                ts.tv_sec = now.tv_sec - ts.tv_sec;
                                ts.tv_nsec = now.tv_nsec - ts.tv_nsec;
                                if (ts.tv_nsec < 0) {
                                        ts.tv_nsec += 1000000000;
                                        ts.tv_sec--;
                                }
                                if (0) D("seq %d/%llu delta %d.%09d", seq,
                                        (unsigned long long)sent,
                                        (int)ts.tv_sec, (int)ts.tv_nsec);
                                t_cur = ts.tv_sec * 1000000000UL + ts.tv_nsec;
                                if (t_cur < t_min)
                                        t_min = t_cur;
                                count ++;
                                av += t_cur;
                                pos = msb64(t_cur);
                                buckets[pos]++;
                                /* now store it in a bucket */
                                ring->head = ring->cur = nm_ring_next(ring, ring->head);
#ifdef BUSYWAIT
                                rx++;
#endif
                        }
                }
                //D("tx %d rx %d", sent, rx);
                //usleep(100000);
                ts.tv_sec = now.tv_sec - last_print.tv_sec;
                ts.tv_nsec = now.tv_nsec - last_print.tv_nsec;
                if (ts.tv_nsec < 0) {
                        ts.tv_nsec += 1000000000;
                        ts.tv_sec--;
                }
                if (ts.tv_sec >= 1) {
                        D("count %d RTT: min %d av %d ns",
                                (int)count, (int)t_min, (int)(av/count));
                        int k, j, kmin, off;
                        char buf[512];

                        for (kmin = 0; kmin < 64; kmin ++)
                                if (buckets[kmin])
                                        break;
                        for (k = 63; k >= kmin; k--)
                                if (buckets[k])
                                        break;
                        buf[0] = '\0';
                        off = 0;
                        for (j = kmin; j <= k; j++) {
                                off += sprintf(buf + off, " %5d", (int)buckets[j]);
                        }
                        D("k: %d .. %d\n\t%s", 1<<kmin, 1<<k, buf);
                        bzero(&buckets, sizeof(buckets));
                        count = 0;
                        g_av += av;
                        av = 0;
                        if (t_min < g_min)
                                g_min = t_min;
                        t_min = ~0;
                        last_print = now;
                }
#ifdef BUSYWAIT
                if (rx < m && ts.tv_sec <= 3 && !targ->cancel)
                        goto again;
#endif /* BUSYWAIT */
        }

        if (sent > 0) {
                D("RTT over %llu packets: min %d av %d ns",
                        (long long unsigned)sent, (int)g_min,
                        (int)((double)g_av/sent));
        }
        targ->completed = 1;

        /* reset the ``used`` flag. */
        targ->used = 0;

        return NULL;
}


/*
 * reply to ping requests
 */
static void *
pong_body(void *data)
{
        struct targ *targ = (struct targ *) data;
        struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
        struct netmap_if *nifp = targ->nmd->nifp;
        struct netmap_ring *txring, *rxring;
        int i;
        uint64_t sent = 0, n = targ->g->npackets;

        if (targ->g->nthreads > 1) {
                D("can only reply ping with 1 thread");
                return NULL;
        }
        if (n > 0)
                D("understood ponger %llu but don't know how to do it",
                        (unsigned long long)n);

        if (targ->g->af == AF_INET6) {
                D("Warning: ping-pong with IPv6 not supported");
        }

        while (!targ->cancel && (n == 0 || sent < n)) {
                uint32_t txhead, txavail;
//#define BUSYWAIT
#ifdef BUSYWAIT
                ioctl(pfd.fd, NIOCRXSYNC, NULL);
#else
                int rv;
                if ( (rv = poll(&pfd, 1, 1000)) <= 0) {
                        D("poll error on queue %d: %s", targ->me,
                                rv ? strerror(errno) : "timeout");
                        continue;
                }
#endif
                txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
                txhead = txring->head;
                txavail = nm_ring_space(txring);
                /* see what we got back */
                for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
                        rxring = NETMAP_RXRING(nifp, i);
                        while (!nm_ring_empty(rxring)) {
                                uint16_t *spkt, *dpkt;
                                uint32_t head = rxring->head;
                                struct netmap_slot *slot = &rxring->slot[head];
                                char *src, *dst;
                                src = NETMAP_BUF(rxring, slot->buf_idx);
                                //D("got pkt %p of size %d", src, slot->len);
                                rxring->head = rxring->cur = nm_ring_next(rxring, head);
                                if (txavail == 0)
                                        continue;
                                dst = NETMAP_BUF(txring,
                                    txring->slot[txhead].buf_idx);
                                /* copy... */
                                dpkt = (uint16_t *)dst;
                                spkt = (uint16_t *)src;
                                nm_pkt_copy(src, dst, slot->len);
                                /* swap source and destination MAC */
                                dpkt[0] = spkt[3];
                                dpkt[1] = spkt[4];
                                dpkt[2] = spkt[5];
                                dpkt[3] = spkt[0];
                                dpkt[4] = spkt[1];
                                dpkt[5] = spkt[2];
                                /* swap source and destination IPv4 */
                                if (spkt[6] == htons(ETHERTYPE_IP)) {
                                        dpkt[13] = spkt[15];
                                        dpkt[14] = spkt[16];
                                        dpkt[15] = spkt[13];
                                        dpkt[16] = spkt[14];
                                }
                                txring->slot[txhead].len = slot->len;
                                //dump_payload(dst, slot->len, txring, txhead);
                                txhead = nm_ring_next(txring, txhead);
                                txavail--;
                                sent++;
                        }
                }
                txring->head = txring->cur = txhead;
                targ->ctr.pkts = sent;
#ifdef BUSYWAIT
                ioctl(pfd.fd, NIOCTXSYNC, NULL);
#endif
        }

        targ->completed = 1;

        /* reset the ``used`` flag. */
        targ->used = 0;

        return NULL;
}


static void *
sender_body(void *data)
{
        struct targ *targ = (struct targ *) data;
        struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
        struct netmap_if *nifp;
        struct netmap_ring *txring = NULL;
        int i;
        uint64_t n = targ->g->npackets / targ->g->nthreads;
        uint64_t sent = 0;
        uint64_t event = 0;
        int options = targ->g->options;
        struct timespec nexttime = { 0, 0}; // XXX silence compiler
        int rate_limit = targ->g->tx_rate;
        struct pkt *pkt = &targ->pkt;
        void *frame;
        int size;

        if (targ->frame == NULL) {
                frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
                size = targ->g->pkt_size + targ->g->virt_header;
        } else {
                frame = targ->frame;
                size = targ->g->pkt_size;
        }

        D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
        if (setaffinity(targ->thread, targ->affinity))
                goto quit;

        /* main loop.*/
        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
        if (rate_limit) {
                targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
                targ->tic.tv_nsec = 0;
                wait_time(targ->tic);
                nexttime = targ->tic;
        }
        if (targ->g->dev_type == DEV_TAP) {
            D("writing to file desc %d", targ->g->main_fd);

            for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
                if (write(targ->g->main_fd, frame, size) != -1)
                        sent++;
                update_addresses(pkt, targ);
                if (i > 10000) {
                        targ->ctr.pkts = sent;
                        targ->ctr.bytes = sent*size;
                        targ->ctr.events = sent;
                        i = 0;
                }
            }
#ifndef NO_PCAP
    } else if (targ->g->dev_type == DEV_PCAP) {
            pcap_t *p = targ->g->p;

            for (i = 0; !targ->cancel && (n == 0 || sent < n); i++) {
                if (pcap_inject(p, frame, size) != -1)
                        sent++;
                update_addresses(pkt, targ);
                if (i > 10000) {
                        targ->ctr.pkts = sent;
                        targ->ctr.bytes = sent*size;
                        targ->ctr.events = sent;
                        i = 0;
                }
            }
#endif /* NO_PCAP */
    } else {
        int tosend = 0;
        u_int bufsz, frag_size = targ->g->frag_size;

        nifp = targ->nmd->nifp;
        txring = NETMAP_TXRING(nifp, targ->nmd->first_tx_ring);
        bufsz = txring->nr_buf_size;
        if (bufsz < frag_size)
                frag_size = bufsz;
        targ->frag_size = targ->g->pkt_size / targ->frags;
        if (targ->frag_size > frag_size) {
                targ->frags = targ->g->pkt_size / frag_size;
                targ->frag_size = frag_size;
                if (targ->g->pkt_size % frag_size != 0)
                        targ->frags++;
        }
        D("frags %u frag_size %u", targ->frags, targ->frag_size);

        /* mark all slots of all rings as changed so initial copy will be done */
        for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
                uint32_t j;
                struct netmap_slot *slot;

                txring = NETMAP_TXRING(nifp, i);
                for (j = 0; j < txring->num_slots; j++) {
                        slot = &txring->slot[j];
                        slot->flags = NS_BUF_CHANGED;
                }
        }

        while (!targ->cancel && (n == 0 || sent < n)) {
                int rv;

                if (rate_limit && tosend <= 0) {
                        tosend = targ->g->burst;
                        nexttime = timespec_add(nexttime, targ->g->tx_period);
                        wait_time(nexttime);
                }

                /*
                 * wait for available room in the send queue(s)
                 */
#ifdef BUSYWAIT
                (void)rv;
                if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
                        D("ioctl error on queue %d: %s", targ->me,
                                        strerror(errno));
                        goto quit;
                }
#else /* !BUSYWAIT */
                if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
                        if (targ->cancel)
                                break;
                        D("poll error on queue %d: %s", targ->me,
                                rv ? strerror(errno) : "timeout");
                        // goto quit;
                }
                if (pfd.revents & POLLERR) {
                        D("poll error on %d ring %d-%d", pfd.fd,
                                targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
                        goto quit;
                }
#endif /* !BUSYWAIT */
                /*
                 * scan our queues and send on those with room
                 */
                for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
                        int m;
                        uint64_t limit = rate_limit ?  tosend : targ->g->burst;

                        if (n > 0 && n == sent)
                                break;

                        if (n > 0 && n - sent < limit)
                                limit = n - sent;
                        txring = NETMAP_TXRING(nifp, i);
                        if (nm_ring_empty(txring))
                                continue;

                        if (targ->g->pkt_min_size > 0) {
                                size = nrand48(targ->seed) %
                                        (targ->g->pkt_size - targ->g->pkt_min_size) +
                                        targ->g->pkt_min_size;
                                update_size(pkt, targ, size);
                        }
                        m = send_packets(txring, pkt, frame, size, targ,
                                         limit, options);
                        ND("limit %lu tail %d m %d",
                                limit, txring->tail, m);
                        sent += m;
                        if (m > 0) //XXX-ste: can m be 0?
                                event++;
                        targ->ctr.pkts = sent;
                        targ->ctr.bytes += m*size;
                        targ->ctr.events = event;
                        if (rate_limit) {
                                tosend -= m;
                                if (tosend <= 0)
                                        break;
                        }
                }
        }
        /* flush any remaining packets */
        if (txring != NULL) {
                D("flush tail %d head %d on thread %p",
                        txring->tail, txring->head,
                        (void *)pthread_self());
                ioctl(pfd.fd, NIOCTXSYNC, NULL);
        }

        /* final part: wait all the TX queues to be empty. */
        for (i = targ->nmd->first_tx_ring; i <= targ->nmd->last_tx_ring; i++) {
                txring = NETMAP_TXRING(nifp, i);
                while (!targ->cancel && nm_tx_pending(txring)) {
                        RD(5, "pending tx tail %d head %d on ring %d",
                                txring->tail, txring->head, i);
                        ioctl(pfd.fd, NIOCTXSYNC, NULL);
                        usleep(1); /* wait 1 tick */
                }
        }
    } /* end DEV_NETMAP */

        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
        targ->completed = 1;
        targ->ctr.pkts = sent;
        targ->ctr.bytes = sent*size;
        targ->ctr.events = event;
quit:
        /* reset the ``used`` flag. */
        targ->used = 0;

        return (NULL);
}


#ifndef NO_PCAP
static void
receive_pcap(u_char *user, const struct pcap_pkthdr * h,
        const u_char * bytes)
{
        struct my_ctrs *ctr = (struct my_ctrs *)user;
        (void)bytes;    /* UNUSED */
        ctr->bytes += h->len;
        ctr->pkts++;
}
#endif /* !NO_PCAP */


static int
receive_packets(struct netmap_ring *ring, u_int limit, int dump, uint64_t *bytes)
{
        u_int head, rx, n;
        uint64_t b = 0;
        u_int complete = 0;

        if (bytes == NULL)
                bytes = &b;

        head = ring->head;
        n = nm_ring_space(ring);
        if (n < limit)
                limit = n;
        for (rx = 0; rx < limit; rx++) {
                struct netmap_slot *slot = &ring->slot[head];
                char *p = NETMAP_BUF(ring, slot->buf_idx);

                *bytes += slot->len;
                if (dump)
                        dump_payload(p, slot->len, ring, head);
                if (!(slot->flags & NS_MOREFRAG))
                        complete++;

                head = nm_ring_next(ring, head);
        }
        ring->head = ring->cur = head;

        return (complete);
}

static void *
receiver_body(void *data)
{
        struct targ *targ = (struct targ *) data;
        struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
        struct netmap_if *nifp;
        struct netmap_ring *rxring;
        int i;
        struct my_ctrs cur;
        uint64_t n = targ->g->npackets / targ->g->nthreads;

        memset(&cur, 0, sizeof(cur));

        if (setaffinity(targ->thread, targ->affinity))
                goto quit;

        D("reading from %s fd %d main_fd %d",
                targ->g->ifname, targ->fd, targ->g->main_fd);
        /* unbounded wait for the first packet. */
        for (;!targ->cancel;) {
                i = poll(&pfd, 1, 1000);
                if (i > 0 && !(pfd.revents & POLLERR))
                        break;
                if (i < 0) {
                        D("poll() error: %s", strerror(errno));
                        goto quit;
                }
                if (pfd.revents & POLLERR) {
                        D("fd error");
                        goto quit;
                }
                RD(1, "waiting for initial packets, poll returns %d %d",
                        i, pfd.revents);
        }
        /* main loop, exit after 1s silence */
        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
    if (targ->g->dev_type == DEV_TAP) {
        while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
                char buf[MAX_BODYSIZE];
                /* XXX should we poll ? */
                i = read(targ->g->main_fd, buf, sizeof(buf));
                if (i > 0) {
                        targ->ctr.pkts++;
                        targ->ctr.bytes += i;
                        targ->ctr.events++;
                }
        }
#ifndef NO_PCAP
    } else if (targ->g->dev_type == DEV_PCAP) {
        while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
                /* XXX should we poll ? */
                pcap_dispatch(targ->g->p, targ->g->burst, receive_pcap,
                        (u_char *)&targ->ctr);
                targ->ctr.events++;
        }
#endif /* !NO_PCAP */
    } else {
        int dump = targ->g->options & OPT_DUMP;

        nifp = targ->nmd->nifp;
        while (!targ->cancel && (n == 0 || targ->ctr.pkts < n)) {
                /* Once we started to receive packets, wait at most 1 seconds
                   before quitting. */
#ifdef BUSYWAIT
                if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
                        D("ioctl error on queue %d: %s", targ->me,
                                        strerror(errno));
                        goto quit;
                }
#else /* !BUSYWAIT */
                if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
                        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
                        targ->toc.tv_sec -= 1; /* Subtract timeout time. */
                        goto out;
                }

                if (pfd.revents & POLLERR) {
                        D("poll err");
                        goto quit;
                }
#endif /* !BUSYWAIT */
                uint64_t cur_space = 0;
                for (i = targ->nmd->first_rx_ring; i <= targ->nmd->last_rx_ring; i++) {
                        int m;

                        rxring = NETMAP_RXRING(nifp, i);
                        /* compute free space in the ring */
                        m = rxring->head + rxring->num_slots - rxring->tail;
                        if (m >= (int) rxring->num_slots)
                                m -= rxring->num_slots;
                        cur_space += m;
                        if (nm_ring_empty(rxring))
                                continue;

                        m = receive_packets(rxring, targ->g->burst, dump, &cur.bytes);
                        cur.pkts += m;
                        if (m > 0)
                                cur.events++;
                }
                cur.min_space = targ->ctr.min_space;
                if (cur_space < cur.min_space)
                        cur.min_space = cur_space;
                targ->ctr = cur;
        }
    }

        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);

#if !defined(BUSYWAIT)
out:
#endif
        targ->completed = 1;
        targ->ctr = cur;

quit:
        /* reset the ``used`` flag. */
        targ->used = 0;

        return (NULL);
}

static void *
txseq_body(void *data)
{
        struct targ *targ = (struct targ *) data;
        struct pollfd pfd = { .fd = targ->fd, .events = POLLOUT };
        struct netmap_ring *ring;
        int64_t sent = 0;
        uint64_t event = 0;
        int options = targ->g->options | OPT_COPY;
        struct timespec nexttime = {0, 0};
        int rate_limit = targ->g->tx_rate;
        struct pkt *pkt = &targ->pkt;
        int frags = targ->g->frags;
        uint32_t sequence = 0;
        int budget = 0;
        void *frame;
        int size;

        if (targ->g->nthreads > 1) {
                D("can only txseq ping with 1 thread");
                return NULL;
        }

        if (targ->g->npackets > 0) {
                D("Ignoring -n argument");
        }

        frame = (char *)pkt + sizeof(pkt->vh) - targ->g->virt_header;
        size = targ->g->pkt_size + targ->g->virt_header;

        D("start, fd %d main_fd %d", targ->fd, targ->g->main_fd);
        if (setaffinity(targ->thread, targ->affinity))
                goto quit;

        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);
        if (rate_limit) {
                targ->tic = timespec_add(targ->tic, (struct timespec){2,0});
                targ->tic.tv_nsec = 0;
                wait_time(targ->tic);
                nexttime = targ->tic;
        }

        /* Only use the first queue. */
        ring = NETMAP_TXRING(targ->nmd->nifp, targ->nmd->first_tx_ring);

        while (!targ->cancel) {
                int64_t limit;
                unsigned int space;
                unsigned int head;
                int fcnt;
                uint16_t sum = 0;
                int rv;

                if (!rate_limit) {
                        budget = targ->g->burst;

                } else if (budget <= 0) {
                        budget = targ->g->burst;
                        nexttime = timespec_add(nexttime, targ->g->tx_period);
                        wait_time(nexttime);
                }

                /* wait for available room in the send queue */
#ifdef BUSYWAIT
                (void)rv;
                if (ioctl(pfd.fd, NIOCTXSYNC, NULL) < 0) {
                        D("ioctl error on queue %d: %s", targ->me,
                                        strerror(errno));
                        goto quit;
                }
#else /* !BUSYWAIT */
                if ( (rv = poll(&pfd, 1, 2000)) <= 0) {
                        if (targ->cancel)
                                break;
                        D("poll error on queue %d: %s", targ->me,
                                rv ? strerror(errno) : "timeout");
                        // goto quit;
                }
                if (pfd.revents & POLLERR) {
                        D("poll error on %d ring %d-%d", pfd.fd,
                                targ->nmd->first_tx_ring, targ->nmd->last_tx_ring);
                        goto quit;
                }
#endif /* !BUSYWAIT */

                /* If no room poll() again. */
                space = nm_ring_space(ring);
                if (!space) {
                        continue;
                }

                limit = budget;

                if (space < limit) {
                        limit = space;
                }

                /* Cut off ``limit`` to make sure is multiple of ``frags``. */
                if (frags > 1) {
                        limit = (limit / frags) * frags;
                }

                limit = sent + limit; /* Convert to absolute. */

                for (fcnt = frags, head = ring->head;
                                sent < limit; sent++, sequence++) {
                        struct netmap_slot *slot = &ring->slot[head];
                        char *p = NETMAP_BUF(ring, slot->buf_idx);
                        uint16_t *w = (uint16_t *)PKT(pkt, body, targ->g->af), t;

                        memcpy(&sum, targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, sizeof(sum));

                        slot->flags = 0;
                        t = *w;
                        PKT(pkt, body, targ->g->af)[0] = sequence >> 24;
                        PKT(pkt, body, targ->g->af)[1] = (sequence >> 16) & 0xff;
                        sum = ~cksum_add(~sum, cksum_add(~t, *w));
                        t = *++w;
                        PKT(pkt, body, targ->g->af)[2] = (sequence >> 8) & 0xff;
                        PKT(pkt, body, targ->g->af)[3] = sequence & 0xff;
                        sum = ~cksum_add(~sum, cksum_add(~t, *w));
                        memcpy(targ->g->af == AF_INET ? &pkt->ipv4.udp.uh_sum : &pkt->ipv6.udp.uh_sum, &sum, sizeof(sum));
                        nm_pkt_copy(frame, p, size);
                        if (fcnt == frags) {
                                update_addresses(pkt, targ);
                        }

                        if (options & OPT_DUMP) {
                                dump_payload(p, size, ring, head);
                        }

                        slot->len = size;

                        if (--fcnt > 0) {
                                slot->flags |= NS_MOREFRAG;
                        } else {
                                fcnt = frags;
                        }

                        if (sent == limit - 1) {
                                /* Make sure we don't push an incomplete
                                 * packet. */
                                assert(!(slot->flags & NS_MOREFRAG));
                                slot->flags |= NS_REPORT;
                        }

                        head = nm_ring_next(ring, head);
                        if (rate_limit) {
                                budget--;
                        }
                }

                ring->cur = ring->head = head;

                event ++;
                targ->ctr.pkts = sent;
                targ->ctr.bytes = sent * size;
                targ->ctr.events = event;
        }

        /* flush any remaining packets */
        D("flush tail %d head %d on thread %p",
                ring->tail, ring->head,
                (void *)pthread_self());
        ioctl(pfd.fd, NIOCTXSYNC, NULL);

        /* final part: wait the TX queues to become empty. */
        while (!targ->cancel && nm_tx_pending(ring)) {
                RD(5, "pending tx tail %d head %d on ring %d",
                                ring->tail, ring->head, targ->nmd->first_tx_ring);
                ioctl(pfd.fd, NIOCTXSYNC, NULL);
                usleep(1); /* wait 1 tick */
        }

        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
        targ->completed = 1;
        targ->ctr.pkts = sent;
        targ->ctr.bytes = sent * size;
        targ->ctr.events = event;
quit:
        /* reset the ``used`` flag. */
        targ->used = 0;

        return (NULL);
}


static char *
multi_slot_to_string(struct netmap_ring *ring, unsigned int head,
                     unsigned int nfrags, char *strbuf, size_t strbuflen)
{
        unsigned int f;
        char *ret = strbuf;

        for (f = 0; f < nfrags; f++) {
                struct netmap_slot *slot = &ring->slot[head];
                int m = snprintf(strbuf, strbuflen, "|%u,%x|", slot->len,
                                 slot->flags);
                if (m >= (int)strbuflen) {
                        break;
                }
                strbuf += m;
                strbuflen -= m;

                head = nm_ring_next(ring, head);
        }

        return ret;
}

static void *
rxseq_body(void *data)
{
        struct targ *targ = (struct targ *) data;
        struct pollfd pfd = { .fd = targ->fd, .events = POLLIN };
        int dump = targ->g->options & OPT_DUMP;
        struct netmap_ring *ring;
        unsigned int frags_exp = 1;
        struct my_ctrs cur;
        unsigned int frags = 0;
        int first_packet = 1;
        int first_slot = 1;
        int i, j, af, nrings;
        uint32_t seq, *seq_exp = NULL;

        memset(&cur, 0, sizeof(cur));

        if (setaffinity(targ->thread, targ->affinity))
                goto quit;

        nrings = targ->nmd->last_rx_ring - targ->nmd->first_rx_ring + 1;
        seq_exp = calloc(nrings, sizeof(uint32_t));
        if (seq_exp == NULL) {
                D("failed to allocate seq array");
                goto quit;
        }

        D("reading from %s fd %d main_fd %d",
                targ->g->ifname, targ->fd, targ->g->main_fd);
        /* unbounded wait for the first packet. */
        for (;!targ->cancel;) {
                i = poll(&pfd, 1, 1000);
                if (i > 0 && !(pfd.revents & POLLERR))
                        break;
                RD(1, "waiting for initial packets, poll returns %d %d",
                        i, pfd.revents);
        }

        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->tic);


        while (!targ->cancel) {
                unsigned int head;
                int limit;

#ifdef BUSYWAIT
                if (ioctl(pfd.fd, NIOCRXSYNC, NULL) < 0) {
                        D("ioctl error on queue %d: %s", targ->me,
                                        strerror(errno));
                        goto quit;
                }
#else /* !BUSYWAIT */
                if (poll(&pfd, 1, 1 * 1000) <= 0 && !targ->g->forever) {
                        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);
                        targ->toc.tv_sec -= 1; /* Subtract timeout time. */
                        goto out;
                }

                if (pfd.revents & POLLERR) {
                        D("poll err");
                        goto quit;
                }
#endif /* !BUSYWAIT */

                for (j = targ->nmd->first_rx_ring; j <= targ->nmd->last_rx_ring; j++) {
                        ring = NETMAP_RXRING(targ->nmd->nifp, j);
                        if (nm_ring_empty(ring))
                                continue;

                        limit = nm_ring_space(ring);
                        if (limit > targ->g->burst)
                                limit = targ->g->burst;

#if 0
                        /* Enable this if
                         *     1) we remove the early-return optimization from
                         *        the netmap poll implementation, or
                         *     2) pipes get NS_MOREFRAG support.
                         * With the current netmap implementation, an experiment like
                         *    pkt-gen -i vale:1{1 -f txseq -F 9
                         *    pkt-gen -i vale:1}1 -f rxseq
                         * would get stuck as soon as we find nm_ring_space(ring) < 9,
                         * since here limit is rounded to 0 and
                         * pipe rxsync is not called anymore by the poll() of this loop.
                         */
                        if (frags_exp > 1) {
                                int o = limit;
                                /* Cut off to the closest smaller multiple. */
                                limit = (limit / frags_exp) * frags_exp;
                                RD(2, "LIMIT %d --> %d", o, limit);
                        }
#endif

                        for (head = ring->head, i = 0; i < limit; i++) {
                                struct netmap_slot *slot = &ring->slot[head];
                                char *p = NETMAP_BUF(ring, slot->buf_idx);
                                int len = slot->len;
                                struct pkt *pkt;

                                if (dump) {
                                        dump_payload(p, slot->len, ring, head);
                                }

                                frags++;
                                if (!(slot->flags & NS_MOREFRAG)) {
                                        if (first_packet) {
                                                first_packet = 0;
                                        } else if (frags != frags_exp) {
                                                char prbuf[512];
                                                RD(1, "Received packets with %u frags, "
                                                                "expected %u, '%s'", frags, frags_exp,
                                                                multi_slot_to_string(ring, head-frags+1,
                                                                frags,
                                                                        prbuf, sizeof(prbuf)));
                                        }
                                        first_packet = 0;
                                        frags_exp = frags;
                                        frags = 0;
                                }

                                p -= sizeof(pkt->vh) - targ->g->virt_header;
                                len += sizeof(pkt->vh) - targ->g->virt_header;
                                pkt = (struct pkt *)p;
                                if (ntohs(pkt->eh.ether_type) == ETHERTYPE_IP)
                                        af = AF_INET;
                                else
                                        af = AF_INET6;

                                if ((char *)pkt + len < ((char *)PKT(pkt, body, af)) +
                                                sizeof(seq)) {
                                        RD(1, "%s: packet too small (len=%u)", __func__,
                                                        slot->len);
                                } else {
                                        seq = (PKT(pkt, body, af)[0] << 24) |
                                                (PKT(pkt, body, af)[1] << 16) |
                                                (PKT(pkt, body, af)[2] << 8) |
                                                PKT(pkt, body, af)[3];
                                        if (first_slot) {
                                                /* Grab the first one, whatever it
                                                   is. */
                                                seq_exp[j] = seq;
                                                first_slot = 0;
                                        } else if (seq != seq_exp[j]) {
                                                uint32_t delta = seq - seq_exp[j];

                                                if (delta < (0xFFFFFFFF >> 1)) {
                                                        RD(2, "Sequence GAP: exp %u found %u",
                                                                        seq_exp[j], seq);
                                                } else {
                                                        RD(2, "Sequence OUT OF ORDER: "
                                                                        "exp %u found %u", seq_exp[j], seq);
                                                }
                                                seq_exp[j] = seq;
                                        }
                                        seq_exp[j]++;
                                }

                                cur.bytes += slot->len;
                                head = nm_ring_next(ring, head);
                                cur.pkts++;
                        }

                        ring->cur = ring->head = head;

                        cur.events++;
                        targ->ctr = cur;
                }
        }
        clock_gettime(CLOCK_REALTIME_PRECISE, &targ->toc);

#ifndef BUSYWAIT
out:
#endif /* !BUSYWAIT */
        targ->completed = 1;
        targ->ctr = cur;

quit:
        if (seq_exp != NULL)
                free(seq_exp);
        /* reset the ``used`` flag. */
        targ->used = 0;

        return (NULL);
}


static void
tx_output(struct glob_arg *g, struct my_ctrs *cur, double delta, const char *msg)
{
        double bw, raw_bw, pps, abs;
        char b1[40], b2[80], b3[80];
        int size;

        if (cur->pkts == 0) {
                printf("%s nothing.\n", msg);
                return;
        }

        size = (int)(cur->bytes / cur->pkts);

        printf("%s %llu packets %llu bytes %llu events %d bytes each in %.2f seconds.\n",
                msg,
                (unsigned long long)cur->pkts,
                (unsigned long long)cur->bytes,
                (unsigned long long)cur->events, size, delta);
        if (delta == 0)
                delta = 1e-6;
        if (size < 60)          /* correct for min packet size */
                size = 60;
        pps = cur->pkts / delta;
        bw = (8.0 * cur->bytes) / delta;
        raw_bw = (8.0 * cur->bytes + cur->pkts * g->framing) / delta;
        abs = cur->pkts / (double)(cur->events);

        printf("Speed: %spps Bandwidth: %sbps (raw %sbps). Average batch: %.2f pkts\n",
                norm(b1, pps, normalize), norm(b2, bw, normalize), norm(b3, raw_bw, normalize), abs);
}

static void
usage(int errcode)
{
/* This usage is generated from the pkt-gen man page:
 *   $ man pkt-gen > x
 * and pasted here adding the string terminators and endlines with simple
 * regular expressions. */
        const char *cmd = "pkt-gen";
        fprintf(stderr,
                "Usage:\n"
                "%s arguments\n"
"     -h      Show program usage and exit.\n"
"\n"
"     -i interface\n"
"             Name of the network interface that pkt-gen operates on.  It can be a system network interface\n"
"             (e.g., em0), the name of a vale(4) port (e.g., valeSSS:PPP), the name of a netmap pipe or\n"
"             monitor, or any valid netmap port name accepted by the nm_open library function, as docu-\n"
"             mented in netmap(4) (NIOCREGIF section).\n"
"\n"
"     -f function\n"
"             The function to be executed by pkt-gen.  Specify tx for transmission, rx for reception, ping\n"
"             for client-side ping-pong operation, and pong for server-side ping-pong operation.\n"
"\n"
"     -n count\n"
"             Number of iterations of the pkt-gen function (with 0 meaning infinite).  In case of tx or rx,\n"
"             count is the number of packets to receive or transmit.  In case of ping or pong, count is the\n"
"             number of ping-pong transactions.\n"
"\n"
"     -l pkt_size\n"
"             Packet size in bytes excluding CRC.  If passed a second time, use random sizes larger or\n"
"             equal than the second one and lower than the first one.\n"
"\n"
"     -b burst_size\n"
"             Transmit or receive up to burst_size packets at a time.\n"
"\n"
"     -4      Use IPv4 addresses.\n"
"\n"
"     -6      Use IPv6 addresses.\n"
"\n"
"     -d dst_ip[:port[-dst_ip:port]]\n"
"             Destination IPv4/IPv6 address and port, single or range.\n"
"\n"
"     -s src_ip[:port[-src_ip:port]]\n"
"             Source IPv4/IPv6 address and port, single or range.\n"
"\n"
"     -D dst_mac\n"
"             Destination MAC address in colon notation (e.g., aa:bb:cc:dd:ee:00).\n"
"\n"
"     -S src_mac\n"
"             Source MAC address in colon notation.\n"
"\n"
"     -a cpu_id\n"
"             Pin the first thread of pkt-gen to a particular CPU using pthread_setaffinity_np(3).  If more\n"
"             threads are used, they are pinned to the subsequent CPUs, one per thread.\n"
"\n"
"     -c cpus\n"
"             Maximum number of CPUs to use (0 means to use all the available ones).\n"
"\n"
"     -p threads\n"
"             Number of threads to use.  By default, only a single thread is used to handle all the netmap\n"
"             rings.  If threads is larger than one, each thread handles a single TX ring (in tx mode), a\n"
"             single RX ring (in rx mode), or a TX/RX ring pair.  The number of threads must be less than or\n"
"             equal to the number of TX (or RX) rings available in the device specified by interface.\n"
"\n"
"     -T report_ms\n"
"             Number of milliseconds between reports.\n"
"\n"
"     -w wait_for_link_time\n"
"             Number of seconds to wait before starting the pkt-gen function, useful to make sure that the\n"
"             network link is up.  A network device driver may take some time to enter netmap mode, or to\n"
"             create a new transmit/receive ring pair when netmap(4) requests one.\n"
"\n"
"     -R rate\n"
"             Packet transmission rate.  Not setting the packet transmission rate tells pkt-gen to transmit\n"
"             packets as quickly as possible.  On servers from 2010 onward netmap(4) is able to com-\n"
"             pletely use all of the bandwidth of a 10 or 40Gbps link, so this option should be used unless\n"
"             your intention is to saturate the link.\n"
"\n"
"     -X      Dump payload of each packet transmitted or received.\n"
"\n"
"     -H len  Add empty virtio-net-header with size 'len'.  Valid sizes are 0, 10 and 12.  This option is\n"
"             only used with Virtual Machine technologies that use virtio as a network interface.\n"
"\n"
"     -P file\n"
"             Load the packet to be transmitted from a pcap file rather than constructing it within\n"
"             pkt-gen.\n"
"\n"
"     -z      Use random IPv4/IPv6 src address/port.\n"
"\n"
"     -Z      Use random IPv4/IPv6 dst address/port.\n"
"\n"
"     -N      Do not normalize units (i.e., use bps, pps instead of Mbps, Kpps, etc.).\n"
"\n"
"     -F num_frags\n"
"             Send multi-slot packets, each one with num_frags fragments.  A multi-slot packet is repre-\n"
"             sented by two or more consecutive netmap slots with the NS_MOREFRAG flag set (except for the\n"
"             last slot).  This is useful to transmit or receive packets larger than the netmap buffer\n"
"             size.\n"
"\n"
"     -M frag_size\n"
"             In multi-slot mode, frag_size specifies the size of each fragment, if smaller than the packet\n"
"             length divided by num_frags.\n"
"\n"
"     -I      Use indirect buffers.  It is only valid for transmitting on VALE ports, and it is implemented\n"
"             by setting the NS_INDIRECT flag in the netmap slots.\n"
"\n"
"     -W      Exit immediately if all the RX rings are empty the first time they are examined.\n"
"\n"
"     -v      Increase the verbosity level.\n"
"\n"
"     -r      In tx mode, do not initialize packets, but send whatever the content of the uninitialized\n"
"             netmap buffers is (rubbish mode).\n"
"\n"
"     -A      Compute mean and standard deviation (over a sliding window) for the transmit or receive rate.\n"
"\n"
"     -B      Take Ethernet framing and CRC into account when computing the average bps.  This adds 4 bytes\n"
"             of CRC and 20 bytes of framing to each packet.\n"
"\n"
"     -C tx_slots[,rx_slots[,tx_rings[,rx_rings]]]\n"
"             Configuration in terms of number of rings and slots to be used when opening the netmap port.\n"
"             Such configuration has an effect on software ports created on the fly, such as VALE ports and\n"
"             netmap pipes.  The configuration may consist of 1 to 4 numbers separated by commas: tx_slots,\n"
"             rx_slots, tx_rings, rx_rings.  Missing numbers or zeroes stand for default values.  As an\n"
"             additional convenience, if exactly one number is specified, then this is assigned to both\n"
"             tx_slots and rx_slots.  If there is no fourth number, then the third one is assigned to both\n"
"             tx_rings and rx_rings.\n"
"\n"
"     -o options                data generation options (parsed using atoi)\n"
"                               OPT_PREFETCH    1\n"
"                               OPT_ACCESS      2\n"
"                               OPT_COPY        4\n"
"                               OPT_MEMCPY      8\n"
"                               OPT_TS          16 (add a timestamp)\n"
"                               OPT_INDIRECT    32 (use indirect buffers)\n"
"                               OPT_DUMP        64 (dump rx/tx traffic)\n"
"                               OPT_RUBBISH     256\n"
"                                       (send whatever the buffers contain)\n"
"                               OPT_RANDOM_SRC  512\n"
"                               OPT_RANDOM_DST  1024\n"
"                               OPT_PPS_STATS   2048\n"
"                               OPT_UPDATE_CSUM 4096\n"
                     "",
                cmd);
        exit(errcode);
}

static int
start_threads(struct glob_arg *g) {
        int i;

        targs = calloc(g->nthreads, sizeof(*targs));
        struct targ *t;
        /*
         * Now create the desired number of threads, each one
         * using a single descriptor.
         */
        for (i = 0; i < g->nthreads; i++) {
                uint64_t seed = (uint64_t)time(0) | ((uint64_t)time(0) << 32);
                t = &targs[i];

                bzero(t, sizeof(*t));
                t->fd = -1; /* default, with pcap */
                t->g = g;
                memcpy(t->seed, &seed, sizeof(t->seed));

                if (g->dev_type == DEV_NETMAP) {
                        int m = -1;

                        /*
                         * if the user wants both HW and SW rings, we need to
                         * know when to switch from NR_REG_ONE_NIC to NR_REG_ONE_SW
                         */
                        if (g->orig_mode == NR_REG_NIC_SW) {
                                m = (g->td_type == TD_TYPE_RECEIVER ?
                                                g->nmd->reg.nr_rx_rings :
                                                g->nmd->reg.nr_tx_rings);
                        }

                        if (i > 0) {
                                int j;
                                /* the first thread uses the fd opened by the main
                                 * thread, the other threads re-open /dev/netmap
                                 */
                                t->nmd = nmport_clone(g->nmd);
                                if (t->nmd == NULL)
                                        return -1;

                                j = i;
                                if (m > 0 && j >= m) {
                                        /* switch to the software rings */
                                        t->nmd->reg.nr_mode = NR_REG_ONE_SW;
                                        j -= m;
                                }
                                t->nmd->reg.nr_ringid = j & NETMAP_RING_MASK;
                                /* Only touch one of the rings (rx is already ok) */
                                if (g->td_type == TD_TYPE_RECEIVER)
                                        t->nmd->reg.nr_flags |= NETMAP_NO_TX_POLL;

                                /* register interface. Override ifname and ringid etc. */
                                if (nmport_open_desc(t->nmd) < 0) {
                                        nmport_undo_prepare(t->nmd);
                                        t->nmd = NULL;
                                        return -1;
                                }
                        } else {
                                t->nmd = g->nmd;
                        }
                        t->fd = t->nmd->fd;
                        t->frags = g->frags;
                } else {
                        targs[i].fd = g->main_fd;
                }
                t->used = 1;
                t->me = i;
                if (g->affinity >= 0) {
                        t->affinity = (g->affinity + i) % g->cpus;
                } else {
                        t->affinity = -1;
                }
                /* default, init packets */
                initialize_packet(t);
        }
        /* Wait for PHY reset. */
        D("Wait %d secs for phy reset", g->wait_link);
        sleep(g->wait_link);
        D("Ready...");

        for (i = 0; i < g->nthreads; i++) {
                t = &targs[i];
                if (pthread_create(&t->thread, NULL, g->td_body, t) == -1) {
                        D("Unable to create thread %d: %s", i, strerror(errno));
                        t->used = 0;
                }
        }
        return 0;
}

static void
main_thread(struct glob_arg *g)
{
        int i;

        struct my_ctrs prev, cur;
        double delta_t;
        struct timeval tic, toc;

        prev.pkts = prev.bytes = prev.events = 0;
        gettimeofday(&prev.t, NULL);
        for (;;) {
                char b1[40], b2[40], b3[40], b4[100];
                uint64_t pps, usec;
                struct my_ctrs x;
                double abs;
                int done = 0;

                usec = wait_for_next_report(&prev.t, &cur.t,
                                g->report_interval);

                cur.pkts = cur.bytes = cur.events = 0;
                cur.min_space = 0;
                if (usec < 10000) /* too short to be meaningful */
                        continue;
                /* accumulate counts for all threads */
                for (i = 0; i < g->nthreads; i++) {
                        cur.pkts += targs[i].ctr.pkts;
                        cur.bytes += targs[i].ctr.bytes;
                        cur.events += targs[i].ctr.events;
                        cur.min_space += targs[i].ctr.min_space;
                        targs[i].ctr.min_space = 99999;
                        if (targs[i].used == 0)
                                done++;
                }
                x.pkts = cur.pkts - prev.pkts;
                x.bytes = cur.bytes - prev.bytes;
                x.events = cur.events - prev.events;
                pps = (x.pkts*1000000 + usec/2) / usec;
                abs = (x.events > 0) ? (x.pkts / (double) x.events) : 0;

                if (!(g->options & OPT_PPS_STATS)) {
                        strcpy(b4, "");
                } else {
                        /* Compute some pps stats using a sliding window. */
                        double ppsavg = 0.0, ppsdev = 0.0;
                        int nsamples = 0;

                        g->win[g->win_idx] = pps;
                        g->win_idx = (g->win_idx + 1) % STATS_WIN;

                        for (i = 0; i < STATS_WIN; i++) {
                                ppsavg += g->win[i];
                                if (g->win[i]) {
                                        nsamples ++;
                                }
                        }
                        ppsavg /= nsamples;

                        for (i = 0; i < STATS_WIN; i++) {
                                if (g->win[i] == 0) {
                                        continue;
                                }
                                ppsdev += (g->win[i] - ppsavg) * (g->win[i] - ppsavg);
                        }
                        ppsdev /= nsamples;
                        ppsdev = sqrt(ppsdev);

                        snprintf(b4, sizeof(b4), "[avg/std %s/%s pps]",
                                 norm(b1, ppsavg, normalize), norm(b2, ppsdev, normalize));
                }

                D("%spps %s(%spkts %sbps in %llu usec) %.2f avg_batch %d min_space",
                        norm(b1, pps, normalize), b4,
                        norm(b2, (double)x.pkts, normalize),
                        norm(b3, 1000000*((double)x.bytes*8+(double)x.pkts*g->framing)/usec, normalize),
                        (unsigned long long)usec,
                        abs, (int)cur.min_space);
                prev = cur;

                if (done == g->nthreads)
                        break;
        }

        timerclear(&tic);
        timerclear(&toc);
        cur.pkts = cur.bytes = cur.events = 0;
        /* final round */
        for (i = 0; i < g->nthreads; i++) {
                struct timespec t_tic, t_toc;
                /*
                 * Join active threads, unregister interfaces and close
                 * file descriptors.
                 */
                if (targs[i].used)
                        pthread_join(targs[i].thread, NULL); /* blocking */
                if (g->dev_type == DEV_NETMAP) {
                        nmport_close(targs[i].nmd);
                        targs[i].nmd = NULL;
                } else {
                        close(targs[i].fd);
                }

                if (targs[i].completed == 0)
                        D("ouch, thread %d exited with error", i);

                /*
                 * Collect threads output and extract information about
                 * how long it took to send all the packets.
                 */
                cur.pkts += targs[i].ctr.pkts;
                cur.bytes += targs[i].ctr.bytes;
                cur.events += targs[i].ctr.events;
                /* collect the largest start (tic) and end (toc) times,
                 * XXX maybe we should do the earliest tic, or do a weighted
                 * average ?
                 */
                t_tic = timeval2spec(&tic);
                t_toc = timeval2spec(&toc);
                if (!timerisset(&tic) || timespec_ge(&targs[i].tic, &t_tic))
                        tic = timespec2val(&targs[i].tic);
                if (!timerisset(&toc) || timespec_ge(&targs[i].toc, &t_toc))
                        toc = timespec2val(&targs[i].toc);
        }

        /* print output. */
        timersub(&toc, &tic, &toc);
        delta_t = toc.tv_sec + 1e-6* toc.tv_usec;
        if (g->td_type == TD_TYPE_SENDER)
                tx_output(g, &cur, delta_t, "Sent");
        else if (g->td_type == TD_TYPE_RECEIVER)
                tx_output(g, &cur, delta_t, "Received");
}

struct td_desc {
        int ty;
        const char *key;
        void *f;
        int default_burst;
};

static struct td_desc func[] = {
        { TD_TYPE_RECEIVER,     "rx",           receiver_body,  512},   /* default */
        { TD_TYPE_SENDER,       "tx",           sender_body,    512 },
        { TD_TYPE_OTHER,        "ping",         ping_body,      1 },
        { TD_TYPE_OTHER,        "pong",         pong_body,      1 },
        { TD_TYPE_SENDER,       "txseq",        txseq_body,     512 },
        { TD_TYPE_RECEIVER,     "rxseq",        rxseq_body,     512 },
        { 0,                    NULL,           NULL,           0 }
};

static int
tap_alloc(char *dev)
{
        struct ifreq ifr;
        int fd, err;
        const char *clonedev = TAP_CLONEDEV;

        (void)err;
        (void)dev;
        /* Arguments taken by the function:
         *
         * char *dev: the name of an interface (or '\0'). MUST have enough
         *   space to hold the interface name if '\0' is passed
         * int flags: interface flags (eg, IFF_TUN etc.)
         */

#ifdef __FreeBSD__
        if (dev[3]) { /* tapSomething */
                static char buf[128];
                snprintf(buf, sizeof(buf), "/dev/%s", dev);
                clonedev = buf;
        }
#endif
        /* open the device */
        if( (fd = open(clonedev, O_RDWR)) < 0 ) {
                return fd;
        }
        D("%s open successful", clonedev);

        /* preparation of the struct ifr, of type "struct ifreq" */
        memset(&ifr, 0, sizeof(ifr));

#ifdef linux
        ifr.ifr_flags = IFF_TAP | IFF_NO_PI;

        if (*dev) {
                /* if a device name was specified, put it in the structure; otherwise,
                * the kernel will try to allocate the "next" device of the
                * specified type */
                size_t len = strlen(dev);
                if (len > IFNAMSIZ) {
                        D("%s too long", dev);
                        return -1;
                }
                memcpy(ifr.ifr_name, dev, len);
        }

        /* try to create the device */
        if( (err = ioctl(fd, TUNSETIFF, (void *) &ifr)) < 0 ) {
                D("failed to do a TUNSETIFF: %s", strerror(errno));
                close(fd);
                return err;
        }

        /* if the operation was successful, write back the name of the
        * interface to the variable "dev", so the caller can know
        * it. Note that the caller MUST reserve space in *dev (see calling
        * code below) */
        strcpy(dev, ifr.ifr_name);
        D("new name is %s", dev);
#endif /* linux */

        /* this is the special file descriptor that the caller will use to talk
         * with the virtual interface */
        return fd;
}

int
main(int arc, char **argv)
{
        int i;
        struct sigaction sa;
        sigset_t ss;

        struct glob_arg g;

        int ch;
        int devqueues = 1;      /* how many device queues */
        int wait_link_arg = 0;

        int pkt_size_done = 0;

        struct td_desc *fn = func;

        bzero(&g, sizeof(g));

        g.main_fd = -1;
        g.td_body = fn->f;
        g.td_type = fn->ty;
        g.report_interval = 1000;       /* report interval */
        g.affinity = -1;
        /* ip addresses can also be a range x.x.x.x-x.x.x.y */
        g.af = AF_INET;         /* default */
        g.src_ip.name = "10.0.0.1";
        g.dst_ip.name = "10.1.0.1";
        g.dst_mac.name = "ff:ff:ff:ff:ff:ff";
        g.src_mac.name = NULL;
        g.pkt_size = 60;
        g.pkt_min_size = 0;
        g.nthreads = 1;
        g.cpus = 1;             /* default */
        g.forever = 1;
        g.tx_rate = 0;
        g.frags = 1;
        g.frag_size = (u_int)-1;        /* use the netmap buffer size by default */
        g.nmr_config = "";
        g.virt_header = 0;
        g.wait_link = 2;        /* wait 2 seconds for physical ports */

        while ((ch = getopt(arc, argv, "46a:f:F:Nn:i:Il:d:s:D:S:b:c:o:p:"
            "T:w:WvR:XC:H:rP:zZAhBM:")) != -1) {

                switch(ch) {
                default:
                        D("bad option %c %s", ch, optarg);
                        usage(-1);
                        break;

                case 'h':
                        usage(0);
                        break;

                case '4':
                        g.af = AF_INET;
                        break;

                case '6':
                        g.af = AF_INET6;
                        break;

                case 'N':
                        normalize = 0;
                        break;

                case 'n':
                        g.npackets = strtoull(optarg, NULL, 10);
                        break;

                case 'F':
                        i = atoi(optarg);
                        if (i < 1 || i > 63) {
                                D("invalid frags %d [1..63], ignore", i);
                                break;
                        }
                        g.frags = i;
                        break;

                case 'M':
                        g.frag_size = atoi(optarg);
                        break;

                case 'f':
                        for (fn = func; fn->key; fn++) {
                                if (!strcmp(fn->key, optarg))
                                        break;
                        }
                        if (fn->key) {
                                g.td_body = fn->f;
                                g.td_type = fn->ty;
                        } else {
                                D("unrecognised function %s", optarg);
                        }
                        break;

                case 'o':       /* data generation options */
                        g.options |= atoi(optarg);
                        break;

                case 'a':       /* force affinity */
                        g.affinity = atoi(optarg);
                        break;

                case 'i':       /* interface */
                        /* a prefix of tap: netmap: or pcap: forces the mode.
                         * otherwise we guess
                         */
                        D("interface is %s", optarg);
                        if (strlen(optarg) > MAX_IFNAMELEN - 8) {
                                D("ifname too long %s", optarg);
                                break;
                        }
                        strcpy(g.ifname, optarg);
                        if (!strcmp(optarg, "null")) {
                                g.dev_type = DEV_NETMAP;
                                g.dummy_send = 1;
                        } else if (!strncmp(optarg, "tap:", 4)) {
                                g.dev_type = DEV_TAP;
                                strcpy(g.ifname, optarg + 4);
                        } else if (!strncmp(optarg, "pcap:", 5)) {
                                g.dev_type = DEV_PCAP;
                                strcpy(g.ifname, optarg + 5);
                        } else if (!strncmp(optarg, "netmap:", 7) ||
                                   !strncmp(optarg, "vale", 4)) {
                                g.dev_type = DEV_NETMAP;
                        } else if (!strncmp(optarg, "tap", 3)) {
                                g.dev_type = DEV_TAP;
                        } else { /* prepend netmap: */
                                g.dev_type = DEV_NETMAP;
                                sprintf(g.ifname, "netmap:%s", optarg);
                        }
                        break;

                case 'I':
                        g.options |= OPT_INDIRECT;      /* use indirect buffers */
                        break;

                case 'l':       /* pkt_size */
                        if (pkt_size_done) {
                                g.pkt_min_size = atoi(optarg);
                        } else {
                                g.pkt_size = atoi(optarg);
                                pkt_size_done = 1;
                        }
                        break;

                case 'd':
                        g.dst_ip.name = optarg;
                        break;

                case 's':
                        g.src_ip.name = optarg;
                        break;

                case 'T':       /* report interval */
                        g.report_interval = atoi(optarg);
                        break;

                case 'w':
                        g.wait_link = atoi(optarg);
                        wait_link_arg = 1;
                        break;

                case 'W':
                        g.forever = 0; /* exit RX with no traffic */
                        break;

                case 'b':       /* burst */
                        g.burst = atoi(optarg);
                        break;
                case 'c':
                        g.cpus = atoi(optarg);
                        break;
                case 'p':
                        g.nthreads = atoi(optarg);
                        break;

                case 'D': /* destination mac */
                        g.dst_mac.name = optarg;
                        break;

                case 'S': /* source mac */
                        g.src_mac.name = optarg;
                        break;
                case 'v':
                        verbose++;
                        break;
                case 'R':
                        g.tx_rate = atoi(optarg);
                        break;
                case 'X':
                        g.options |= OPT_DUMP;
                        break;
                case 'C':
                        D("WARNING: the 'C' option is deprecated, use the '+conf:' libnetmap option instead");
                        g.nmr_config = strdup(optarg);
                        break;
                case 'H':
                        g.virt_header = atoi(optarg);
                        break;
                case 'P':
                        g.packet_file = strdup(optarg);
                        break;
                case 'r':
                        g.options |= OPT_RUBBISH;
                        break;
                case 'z':
                        g.options |= OPT_RANDOM_SRC;
                        break;
                case 'Z':
                        g.options |= OPT_RANDOM_DST;
                        break;
                case 'A':
                        g.options |= OPT_PPS_STATS;
                        break;
                case 'B':
                        /* raw packets have4 bytes crc + 20 bytes framing */
                        // XXX maybe add an option to pass the IFG
                        g.framing = 24 * 8;
                        break;
                }
        }

        if (strlen(g.ifname) <=0 ) {
                D("missing ifname");
                usage(-1);
        }

        if (g.burst == 0) {
                g.burst = fn->default_burst;
                D("using default burst size: %d", g.burst);
        }

        g.system_cpus = i = system_ncpus();
        if (g.cpus < 0 || g.cpus > i) {
                D("%d cpus is too high, have only %d cpus", g.cpus, i);
                usage(-1);
        }
        D("running on %d cpus (have %d)", g.cpus, i);
        if (g.cpus == 0)
                g.cpus = i;

        if (!wait_link_arg && !strncmp(g.ifname, "vale", 4)) {
                g.wait_link = 0;
        }

        if (g.pkt_size < 16 || g.pkt_size > MAX_PKTSIZE) {
                D("bad pktsize %d [16..%d]\n", g.pkt_size, MAX_PKTSIZE);
                usage(-1);
        }

        if (g.pkt_min_size > 0 && (g.pkt_min_size < 16 || g.pkt_min_size > g.pkt_size)) {
                D("bad pktminsize %d [16..%d]\n", g.pkt_min_size, g.pkt_size);
                usage(-1);
        }

        if (g.src_mac.name == NULL) {
                static char mybuf[20] = "00:00:00:00:00:00";
                /* retrieve source mac address. */
                if (source_hwaddr(g.ifname, mybuf) == -1) {
                        D("Unable to retrieve source mac");
                        // continue, fail later
                }
                g.src_mac.name = mybuf;
        }
        /* extract address ranges */
        if (extract_mac_range(&g.src_mac) || extract_mac_range(&g.dst_mac))
                usage(-1);
        g.options |= extract_ip_range(&g.src_ip, g.af);
        g.options |= extract_ip_range(&g.dst_ip, g.af);

        if (g.virt_header != 0 && g.virt_header != VIRT_HDR_1
                        && g.virt_header != VIRT_HDR_2) {
                D("bad virtio-net-header length");
                usage(-1);
        }

    if (g.dev_type == DEV_TAP) {
        D("want to use tap %s", g.ifname);
        g.main_fd = tap_alloc(g.ifname);
        if (g.main_fd < 0) {
                D("cannot open tap %s", g.ifname);
                usage(-1);
        }
#ifndef NO_PCAP
    } else if (g.dev_type == DEV_PCAP) {
        char pcap_errbuf[PCAP_ERRBUF_SIZE];

        pcap_errbuf[0] = '\0'; // init the buffer
        g.p = pcap_open_live(g.ifname, 256 /* XXX */, 1, 100, pcap_errbuf);
        if (g.p == NULL) {
                D("cannot open pcap on %s", g.ifname);
                usage(-1);
        }
        g.main_fd = pcap_fileno(g.p);
        D("using pcap on %s fileno %d", g.ifname, g.main_fd);
#endif /* !NO_PCAP */
    } else if (g.dummy_send) { /* but DEV_NETMAP */
        D("using a dummy send routine");
    } else {
        g.nmd = nmport_prepare(g.ifname);
        if (g.nmd == NULL)
                goto out;

        parse_nmr_config(g.nmr_config, &g.nmd->reg);

        g.nmd->reg.nr_flags |= NR_ACCEPT_VNET_HDR;

        /*
         * Open the netmap device using nm_open().
         *
         * protocol stack and may cause a reset of the card,
         * which in turn may take some time for the PHY to
         * reconfigure. We do the open here to have time to reset.
         */
        g.orig_mode = g.nmd->reg.nr_mode;
        if (g.nthreads > 1) {
                switch (g.orig_mode) {
                case NR_REG_ALL_NIC:
                case NR_REG_NIC_SW:
                        g.nmd->reg.nr_mode = NR_REG_ONE_NIC;
                        break;
                case NR_REG_SW:
                        g.nmd->reg.nr_mode = NR_REG_ONE_SW;
                        break;
                default:
                        break;
                }
                g.nmd->reg.nr_ringid = 0;
        }
        if (nmport_open_desc(g.nmd) < 0)
                goto out;
        g.main_fd = g.nmd->fd;
        ND("mapped %luKB at %p", (unsigned long)(g.nmd->req.nr_memsize>>10),
                                g.nmd->mem);

        if (g.virt_header) {
                /* Set the virtio-net header length, since the user asked
                 * for it explicitly. */
                set_vnet_hdr_len(&g);
        } else {
                /* Check whether the netmap port we opened requires us to send
                 * and receive frames with virtio-net header. */
                get_vnet_hdr_len(&g);
        }

        /* get num of queues in tx or rx */
        if (g.td_type == TD_TYPE_SENDER)
                devqueues = g.nmd->reg.nr_tx_rings + g.nmd->reg.nr_host_tx_rings;
        else
                devqueues = g.nmd->reg.nr_rx_rings + g.nmd->reg.nr_host_rx_rings;

        /* validate provided nthreads. */
        if (g.nthreads < 1 || g.nthreads > devqueues) {
                D("bad nthreads %d, have %d queues", g.nthreads, devqueues);
                // continue, fail later
        }

        if (g.td_type == TD_TYPE_SENDER) {
                int mtu = get_if_mtu(&g);

                if (mtu > 0 && g.pkt_size > mtu) {
                        D("pkt_size (%d) must be <= mtu (%d)",
                                g.pkt_size, mtu);
                        return -1;
                }
        }

        if (verbose) {
                struct netmap_if *nifp = g.nmd->nifp;
                struct nmreq_register *req = &g.nmd->reg;

                D("nifp at offset %"PRIu64" ntxqs %d nrxqs %d memid %d",
                    req->nr_offset, req->nr_tx_rings, req->nr_rx_rings,
                    req->nr_mem_id);
                for (i = 0; i < req->nr_tx_rings + req->nr_host_tx_rings; i++) {
                        struct netmap_ring *ring = NETMAP_TXRING(nifp, i);
                        D("   TX%d at offset %p slots %d", i,
                            (void *)((char *)ring - (char *)nifp), ring->num_slots);
                }
                for (i = 0; i < req->nr_rx_rings + req->nr_host_rx_rings; i++) {
                        struct netmap_ring *ring = NETMAP_RXRING(nifp, i);
                        D("   RX%d at offset %p slots %d", i,
                            (void *)((char *)ring - (char *)nifp), ring->num_slots);
                }
        }

        /* Print some debug information. */
        fprintf(stdout,
                "%s %s: %d queues, %d threads and %d cpus.\n",
                (g.td_type == TD_TYPE_SENDER) ? "Sending on" :
                        ((g.td_type == TD_TYPE_RECEIVER) ? "Receiving from" :
                        "Working on"),
                g.ifname,
                devqueues,
                g.nthreads,
                g.cpus);
        if (g.td_type == TD_TYPE_SENDER) {
                fprintf(stdout, "%s -> %s (%s -> %s)\n",
                        g.src_ip.name, g.dst_ip.name,
                        g.src_mac.name, g.dst_mac.name);
        }

out:
        /* Exit if something went wrong. */
        if (g.main_fd < 0) {
                D("aborting");
                usage(-1);
        }
    }


        if (g.options) {
                D("--- SPECIAL OPTIONS:%s%s%s%s%s%s\n",
                        g.options & OPT_PREFETCH ? " prefetch" : "",
                        g.options & OPT_ACCESS ? " access" : "",
                        g.options & OPT_MEMCPY ? " memcpy" : "",
                        g.options & OPT_INDIRECT ? " indirect" : "",
                        g.options & OPT_COPY ? " copy" : "",
                        g.options & OPT_RUBBISH ? " rubbish " : "");
        }

        g.tx_period.tv_sec = g.tx_period.tv_nsec = 0;
        if (g.tx_rate > 0) {
                /* try to have at least something every second,
                 * reducing the burst size to some 0.01s worth of data
                 * (but no less than one full set of fragments)
                 */
                uint64_t x;
                int lim = (g.tx_rate)/300;
                if (g.burst > lim)
                        g.burst = lim;
                if (g.burst == 0)
                        g.burst = 1;
                x = ((uint64_t)1000000000 * (uint64_t)g.burst) / (uint64_t) g.tx_rate;
                g.tx_period.tv_nsec = x;
                g.tx_period.tv_sec = g.tx_period.tv_nsec / 1000000000;
                g.tx_period.tv_nsec = g.tx_period.tv_nsec % 1000000000;
        }
        if (g.td_type == TD_TYPE_SENDER)
            D("Sending %d packets every  %jd.%09ld s",
                        g.burst, (intmax_t)g.tx_period.tv_sec, g.tx_period.tv_nsec);
        /* Install ^C handler. */
        global_nthreads = g.nthreads;
        sigemptyset(&ss);
        sigaddset(&ss, SIGINT);
        /* block SIGINT now, so that all created threads will inherit the mask */
        if (pthread_sigmask(SIG_BLOCK, &ss, NULL) < 0) {
                D("failed to block SIGINT: %s", strerror(errno));
        }
        if (start_threads(&g) < 0)
                return 1;
        /* Install the handler and re-enable SIGINT for the main thread */
        memset(&sa, 0, sizeof(sa));
        sa.sa_handler = sigint_h;
        if (sigaction(SIGINT, &sa, NULL) < 0) {
                D("failed to install ^C handler: %s", strerror(errno));
        }

        if (pthread_sigmask(SIG_UNBLOCK, &ss, NULL) < 0) {
                D("failed to re-enable SIGINT: %s", strerror(errno));
        }
        main_thread(&g);
        free(targs);
        return 0;
}

/* end of file */