root/usr/src/cmd/lockstat/lockstat.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <limits.h>
#include <sys/types.h>
#include <sys/modctl.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <dtrace.h>
#include <sys/lockstat.h>
#include <alloca.h>
#include <signal.h>
#include <assert.h>

/*
 * Command-line option letters accepted by lockstat, in getopt() syntax
 * (a letter followed by ':' takes an argument).  Presumably consumed by the
 * option parser later in this file -- confirm there.
 */
#define LOCKSTAT_OPTSTR "x:bths:n:d:i:l:f:e:ckwWgCHEATID:RpPo:V"

/* Number of entries in lsrec_t's ls_stack[] array. */
#define LS_MAX_STACK_DEPTH      50
/* Number of event slots; sizes g_event_info[], g_enabled[], g_min_duration[] */
#define LS_MAX_EVENTS           64

/*
 * One lockstat data record.
 *
 * Field order is significant: the LS_BASIC/LS_TIME/LS_HIST/LS_STACK() record
 * sizes are offsetof() values into this structure, and code such as
 * lsrec_fill() and coalesce() only touches the leading g_recsize bytes of a
 * record.  Do not reorder fields without revisiting those macros.
 */
typedef struct lsrec {
        struct lsrec    *ls_next;       /* next in hash chain */
        uintptr_t       ls_lock;        /* lock address */
        uintptr_t       ls_caller;      /* caller address */
        uint32_t        ls_count;       /* cumulative event count */
        uint32_t        ls_event;       /* type of event */
        uintptr_t       ls_refcnt;      /* cumulative reference count */
        uint64_t        ls_time;        /* cumulative event duration */
        uint32_t        ls_hist[64];    /* log2(duration) histogram */
        uintptr_t       ls_stack[LS_MAX_STACK_DEPTH];   /* stack trace PCs */
} lsrec_t;

/*
 * Cursor handed to process_aggregate() as its walker argument: lsd_next is
 * the record that callback fills in, and lsd_count is asserted there to stay
 * below g_nrecs.
 */
typedef struct lsdata {
        struct lsrec    *lsd_next;      /* next available */
        int             lsd_count;      /* number of records */
} lsdata_t;

/*
 * Definitions for the types of experiments which can be run.  They are
 * listed in increasing order of memory cost and processing time cost.
 * The numerical value of each type is the number of bytes needed per record.
 *
 * Each value is the offset of the first lsrec_t field that the experiment
 * does NOT need, so a record of that type is the leading part of an lsrec_t:
 *
 *      LS_BASIC        everything before ls_time (no timing data)
 *      LS_TIME         adds ls_time
 *      LS_HIST         adds the whole ls_hist[] histogram
 *      LS_STACK(depth) adds the first <depth> entries of ls_stack[]
 */
#define LS_BASIC        offsetof(lsrec_t, ls_time)
#define LS_TIME         offsetof(lsrec_t, ls_hist[0])
#define LS_HIST         offsetof(lsrec_t, ls_stack[0])
#define LS_STACK(depth) offsetof(lsrec_t, ls_stack[depth])

/* Report routines; their definitions are not in this part of the file. */
static void report_stats(FILE *, lsrec_t **, size_t, uint64_t, uint64_t);
static void report_trace(FILE *, lsrec_t **);

/*
 * Symbol-table helpers defined outside this file.  addr_to_sym() is used by
 * coalesce_symbol() below, which treats its two out-parameters as the offset
 * of the address within the symbol and the symbol's size.
 */
extern int symtab_init(void);
extern char *addr_to_sym(uintptr_t, uintptr_t *, size_t *);
extern uintptr_t sym_to_addr(char *name);
extern size_t sym_size(char *name);
/* Explicit declaration of strtok_r(); <string.h> is also included above. */
extern char *strtok_r(char *, const char *, char **);

/* Default for -n (maximum number of data records), as printed by usage(). */
#define DEFAULT_NRECS   10000
/* Default for -i (interrupt rate in Hz), as printed by usage(). */
#define DEFAULT_HZ      97
/* NOTE(review): presumably the upper bound accepted for -i; confirm in main */
#define MAX_HZ          1000
/*
 * NOTE(review): presumably the bounds applied to the DTrace aggregation
 * buffer size, in bytes; their use is not visible in this part of the file.
 */
#define MIN_AGGSIZE     (16 * 1024)
#define MAX_AGGSIZE     (32 * 1024 * 1024)

/*
 * Global state.  Comments marked "presumably" describe variables whose use is
 * not visible in this part of the file; the g_?flag names suggest they mirror
 * the command-line option of the same letter -- confirm against the option
 * parser before relying on that.
 */
static int g_stkdepth;          /* ls_stack[] entries compared by the sorts */
static int g_topn = INT_MAX;    /* presumably the -D display limit */
static hrtime_t g_elapsed;      /* presumably the elapsed sampling time */
static int g_rates = 0;         /* presumably -R */
static int g_pflag = 0;         /* presumably -p */
static int g_Pflag = 0;         /* presumably -P */
static int g_wflag = 0;         /* presumably -w */
static int g_Wflag = 0;         /* presumably -W */
static int g_cflag = 0;         /* presumably -c */
static int g_kflag = 0;         /* presumably -k */
static int g_gflag = 0;         /* presumably -g */
static int g_Vflag = 0;         /* dprog_compile() prints the D program */
static int g_tracing = 0;       /* emit trace() actions, not aggregations */
static size_t g_recsize;        /* bytes per record; one of the LS_* sizes */
static size_t g_nrecs;          /* record limit asserted in process_aggregate */
static int g_nrecs_used;        /* presumably the number of records filled */
static uchar_t g_enabled[LS_MAX_EVENTS];        /* presumably per-event enable */
static hrtime_t g_min_duration[LS_MAX_EVENTS];  /* per-event ">=" threshold */
static dtrace_hdl_t *g_dtp;     /* libdtrace handle used by all dtrace_*() */
static char *g_predicate;       /* extra D predicate for non-'I' events */
static char *g_ipredicate;      /* extra D predicate for 'I' events */
static char *g_prog;            /* generated D program text (see dprog_add) */
static int g_proglen;           /* bytes allocated for g_prog, including NUL */
static int g_dropped;           /* presumably a count of dropped records */

/*
 * Static description of one event slot in g_event_info[].  Trailing members
 * left out of an initializer are zero: an empty ev_name marks a slot for
 * which dprog_addevent() generates nothing, and NULL ev_predicate/ev_acquire
 * mean "no predicate" / "no separate acquire clause".
 */
typedef struct ls_event_info {
        char    ev_type;                /* event class: 'C', 'H', 'I' or 'E' */
        char    ev_lhdr[20];            /* e.g. "Lock" or "CPU+PIL" */
        char    ev_desc[80];            /* description printed by show_events */
        char    ev_units[10];           /* units string, e.g. "nsec" */
        char    ev_name[DTRACE_NAMELEN]; /* probe description for the event */
        char    *ev_predicate;          /* optional D predicate for the probe */
        char    *ev_acquire;            /* optional probe for acquire clause */
} ls_event_info_t;

/*
 * Event table, indexed by event number (the value printed by show_events()
 * and emitted as the first aggregation/trace key by dprog_addevent()).
 * Slots are grouped by ev_type: 0-31 contention ('C'), 32-55 hold ('H'),
 * 56-59 interrupt ('I') and 60-63 error ('E').  "Unknown event" entries are
 * placeholders: they have no ev_name, so no D clause is generated for them,
 * and show_events() leaves them out when finding a type's event range.
 */
static ls_event_info_t g_event_info[LS_MAX_EVENTS] = {
        /* Contention events */
        { 'C',  "Lock", "Adaptive mutex spin",                  "nsec",
            "lockstat:::adaptive-spin" },
        { 'C',  "Lock", "Adaptive mutex block",                 "nsec",
            "lockstat:::adaptive-block" },
        { 'C',  "Lock", "Spin lock spin",                       "nsec",
            "lockstat:::spin-spin" },
        { 'C',  "Lock", "Thread lock spin",                     "nsec",
            "lockstat:::thread-spin" },
        { 'C',  "Lock", "R/W writer blocked by writer",         "nsec",
            "lockstat:::rw-block", "arg2 == 0 && arg3 == 1" },
        { 'C',  "Lock", "R/W writer blocked by readers",        "nsec",
            "lockstat:::rw-block", "arg2 == 0 && arg3 == 0 && arg4" },
        { 'C',  "Lock", "R/W reader blocked by writer",         "nsec",
            "lockstat:::rw-block", "arg2 != 0 && arg3 == 1" },
        { 'C',  "Lock", "R/W reader blocked by write wanted",   "nsec",
            "lockstat:::rw-block", "arg2 != 0 && arg3 == 0 && arg4" },
        { 'C',  "Lock", "Unknown event (type 8)",               "units" },
        { 'C',  "Lock", "Unknown event (type 9)",               "units" },
        { 'C',  "Lock", "Unknown event (type 10)",              "units" },
        { 'C',  "Lock", "Unknown event (type 11)",              "units" },
        { 'C',  "Lock", "Unknown event (type 12)",              "units" },
        { 'C',  "Lock", "Unknown event (type 13)",              "units" },
        { 'C',  "Lock", "Unknown event (type 14)",              "units" },
        { 'C',  "Lock", "Unknown event (type 15)",              "units" },
        { 'C',  "Lock", "Unknown event (type 16)",              "units" },
        { 'C',  "Lock", "Unknown event (type 17)",              "units" },
        { 'C',  "Lock", "Unknown event (type 18)",              "units" },
        { 'C',  "Lock", "Unknown event (type 19)",              "units" },
        { 'C',  "Lock", "Unknown event (type 20)",              "units" },
        { 'C',  "Lock", "Unknown event (type 21)",              "units" },
        { 'C',  "Lock", "Unknown event (type 22)",              "units" },
        { 'C',  "Lock", "Unknown event (type 23)",              "units" },
        { 'C',  "Lock", "Unknown event (type 24)",              "units" },
        { 'C',  "Lock", "Unknown event (type 25)",              "units" },
        { 'C',  "Lock", "Unknown event (type 26)",              "units" },
        { 'C',  "Lock", "Unknown event (type 27)",              "units" },
        { 'C',  "Lock", "Unknown event (type 28)",              "units" },
        { 'C',  "Lock", "Unknown event (type 29)",              "units" },
        { 'C',  "Lock", "Unknown event (type 30)",              "units" },
        { 'C',  "Lock", "Unknown event (type 31)",              "units" },
        /* Hold events: ev_name is the release probe, ev_acquire the acquire */
        { 'H',  "Lock", "Adaptive mutex hold",                  "nsec",
            "lockstat:::adaptive-release", NULL,
            "lockstat:::adaptive-acquire" },
        { 'H',  "Lock", "Spin lock hold",                       "nsec",
            "lockstat:::spin-release", NULL,
            "lockstat:::spin-acquire" },
        { 'H',  "Lock", "R/W writer hold",                      "nsec",
            "lockstat:::rw-release", "arg1 == 0",
            "lockstat:::rw-acquire" },
        { 'H',  "Lock", "R/W reader hold",                      "nsec",
            "lockstat:::rw-release", "arg1 != 0",
            "lockstat:::rw-acquire" },
        { 'H',  "Lock", "Unknown event (type 36)",              "units" },
        { 'H',  "Lock", "Unknown event (type 37)",              "units" },
        { 'H',  "Lock", "Unknown event (type 38)",              "units" },
        { 'H',  "Lock", "Unknown event (type 39)",              "units" },
        { 'H',  "Lock", "Unknown event (type 40)",              "units" },
        { 'H',  "Lock", "Unknown event (type 41)",              "units" },
        { 'H',  "Lock", "Unknown event (type 42)",              "units" },
        { 'H',  "Lock", "Unknown event (type 43)",              "units" },
        { 'H',  "Lock", "Unknown event (type 44)",              "units" },
        { 'H',  "Lock", "Unknown event (type 45)",              "units" },
        { 'H',  "Lock", "Unknown event (type 46)",              "units" },
        { 'H',  "Lock", "Unknown event (type 47)",              "units" },
        { 'H',  "Lock", "Unknown event (type 48)",              "units" },
        { 'H',  "Lock", "Unknown event (type 49)",              "units" },
        { 'H',  "Lock", "Unknown event (type 50)",              "units" },
        { 'H',  "Lock", "Unknown event (type 51)",              "units" },
        { 'H',  "Lock", "Unknown event (type 52)",              "units" },
        { 'H',  "Lock", "Unknown event (type 53)",              "units" },
        { 'H',  "Lock", "Unknown event (type 54)",              "units" },
        { 'H',  "Lock", "Unknown event (type 55)",              "units" },
        /* Interrupt events */
        { 'I',  "CPU+PIL", "Profiling interrupt",               "nsec",
            "profile:::profile-97", NULL },
        { 'I',  "Lock", "Unknown event (type 57)",              "units" },
        { 'I',  "Lock", "Unknown event (type 58)",              "units" },
        { 'I',  "Lock", "Unknown event (type 59)",              "units" },
        /* Error events */
        { 'E',  "Lock", "Recursive lock entry detected",        "(N/A)",
            "lockstat:::rw-release", NULL, "lockstat:::rw-acquire" },
        { 'E',  "Lock", "Lockstat enter failure",               "(N/A)" },
        { 'E',  "Lock", "Lockstat exit failure",                "nsec"  },
        { 'E',  "Lock", "Lockstat record failure",              "(N/A)" },
};

/*
 * Print "lockstat: <formatted message>" to stderr, optionally followed by
 * ": <strerror(errno)>", then a newline, and exit with status 2.
 *
 * do_perror    non-zero to append the text for the errno value that was
 *              current when fail() was entered
 * message      printf-style format, followed by its arguments
 *
 * Does not return.
 */
static void
fail(int do_perror, const char *message, ...)
{
        const int err = errno;  /* grab it before stdio can change it */
        va_list ap;

        (void) fputs("lockstat: ", stderr);

        va_start(ap, message);
        (void) vfprintf(stderr, message, ap);
        va_end(ap);

        if (do_perror != 0) {
                (void) fprintf(stderr, ": %s\n", strerror(err));
        } else {
                (void) fputc('\n', stderr);
        }

        exit(2);
}

/*
 * Like fail(), but for libdtrace failures: print
 * "lockstat: <formatted message>: <DTrace error text>" to stderr, where the
 * error text is dtrace_errmsg() for the current dtrace_errno() of g_dtp,
 * and exit with status 2.
 *
 * message      printf-style format, followed by its arguments
 *
 * Does not return.
 */
static void
dfail(const char *message, ...)
{
        va_list ap;

        (void) fputs("lockstat: ", stderr);

        va_start(ap, message);
        (void) vfprintf(stderr, message, ap);
        va_end(ap);

        (void) fprintf(stderr, ": %s\n",
            dtrace_errmsg(g_dtp, dtrace_errno(g_dtp)));

        exit(2);
}

/*
 * Print to stderr the list of known events of one class.
 *
 * event_type   class letter to match against ev_type ('C', 'H', 'I', 'E')
 * desc         human-readable class name used in the heading
 *
 * The heading shows the first and last matching event numbers as a range
 * usable with -e; every slot in that range is then listed.  Placeholder
 * ("Unknown event") slots are ignored when determining the range.  If no
 * event of the requested class exists, nothing is printed.
 */
static void
show_events(char event_type, char *desc)
{
        int i, first = -1, last = -1;

        for (i = 0; i < LS_MAX_EVENTS; i++) {
                ls_event_info_t *evp = &g_event_info[i];
                if (evp->ev_type != event_type ||
                    strncmp(evp->ev_desc, "Unknown event", 13) == 0)
                        continue;
                if (first == -1)
                        first = i;
                last = i;
        }

        /*
         * Without a match there is no range to show; bail out rather than
         * printing (and indexing g_event_info[] with) meaningless bounds.
         */
        if (first == -1)
                return;

        (void) fprintf(stderr,
            "\n%s events (lockstat -%c or lockstat -e %d-%d):\n\n",
            desc, event_type, first, last);

        for (i = first; i <= last; i++)
                (void) fprintf(stderr,
                    "%4d = %s\n", i, g_event_info[i].ev_desc);
}

/*
 * Print the full usage message to stderr -- the option summary (with the
 * DEFAULT_HZ and DEFAULT_NRECS values substituted in), followed by the list
 * of events in each class -- and exit with status 1.  Does not return.
 */
static void
usage(void)
{
        (void) fprintf(stderr,
            "Usage: lockstat [options] command [args]\n"
            "\nEvent selection options:\n\n"
            "  -C              watch contention events [on by default]\n"
            "  -E              watch error events [off by default]\n"
            "  -H              watch hold events [off by default]\n"
            "  -I              watch interrupt events [off by default]\n"
            "  -A              watch all lock events [equivalent to -CH]\n"
            "  -e event_list   only watch the specified events (shown below);\n"
            "                  <event_list> is a comma-separated list of\n"
            "                  events or ranges of events, e.g. 1,4-7,35\n"
            "  -i rate         interrupt rate for -I [default: %d Hz]\n"
            "\nData gathering options:\n\n"
            "  -b              basic statistics (lock, caller, event count)\n"
            "  -t              timing for all events [default]\n"
            "  -h              histograms for event times\n"
            "  -s depth        stack traces <depth> deep\n"
            "  -x opt[=val]    enable or modify DTrace options\n"
            "\nData filtering options:\n\n"
            "  -n nrecords     maximum number of data records [default: %d]\n"
            "  -l lock[,size]  only watch <lock>, which can be specified as a\n"
            "                  symbolic name or hex address; <size> defaults\n"
            "                  to the ELF symbol size if available, 1 if not\n"
            "  -f func[,size]  only watch events generated by <func>\n"
            "  -d duration     only watch events longer than <duration>\n"
            "  -T              trace (rather than sample) events\n"
            "\nData reporting options:\n\n"
            "  -c              coalesce lock data for arrays like pse_mutex[]\n"
            "  -k              coalesce PCs within functions\n"
            "  -g              show total events generated by function\n"
            "  -w              wherever: don't distinguish events by caller\n"
            "  -W              whichever: don't distinguish events by lock\n"
            "  -R              display rates rather than counts\n"
            "  -p              parsable output format (awk(1)-friendly)\n"
            "  -P              sort lock data by (count * avg_time) product\n"
            "  -D n            only display top <n> events of each type\n"
            "  -o filename     send output to <filename>\n",
            DEFAULT_HZ, DEFAULT_NRECS);

        /* One event listing per class letter used in g_event_info[]. */
        show_events('C', "Contention");
        show_events('H', "Hold-time");
        show_events('I', "Interrupt");
        show_events('E', "Error");
        (void) fprintf(stderr, "\n");

        exit(1);
}

/*
 * Full-key comparator: orders records by event, then by the first
 * g_stkdepth stack entries (highest index first), then by caller, then by
 * lock address.  Returns -1, 0 or 1.
 */
static int
lockcmp(lsrec_t *a, lsrec_t *b)
{
        int depth;

        if (a->ls_event != b->ls_event)
                return (a->ls_event < b->ls_event ? -1 : 1);

        for (depth = g_stkdepth; depth-- > 0; ) {
                uintptr_t pca = a->ls_stack[depth];
                uintptr_t pcb = b->ls_stack[depth];

                if (pca != pcb)
                        return (pca < pcb ? -1 : 1);
        }

        if (a->ls_caller != b->ls_caller)
                return (a->ls_caller < b->ls_caller ? -1 : 1);

        if (a->ls_lock != b->ls_lock)
                return (a->ls_lock < b->ls_lock ? -1 : 1);

        return (0);
}

/*
 * Comparator: ascending by event, then descending by event count.
 * Returns a negative, zero or positive value.
 *
 * The counts are uint32_t, so they are compared explicitly rather than
 * subtracted: the difference of two unsigned counts converted to int has the
 * wrong sign once the counts differ by 2^31 or more.
 */
static int
countcmp(lsrec_t *a, lsrec_t *b)
{
        if (a->ls_event < b->ls_event)
                return (-1);
        if (a->ls_event > b->ls_event)
                return (1);

        if (a->ls_count > b->ls_count)
                return (-1);
        if (a->ls_count < b->ls_count)
                return (1);

        return (0);
}

/*
 * Comparator: ascending by event, then descending by cumulative time.
 * Returns -1, 0 or 1.
 */
static int
timecmp(lsrec_t *a, lsrec_t *b)
{
        if (a->ls_event != b->ls_event)
                return (a->ls_event < b->ls_event ? -1 : 1);

        if (a->ls_time == b->ls_time)
                return (0);

        /* Larger times sort first. */
        return (a->ls_time > b->ls_time ? -1 : 1);
}

/*
 * Comparator that ignores the call site: orders records by event, then by
 * lock address.  Returns -1, 0 or 1.
 */
static int
lockcmp_anywhere(lsrec_t *a, lsrec_t *b)
{
        if (a->ls_event != b->ls_event)
                return (a->ls_event < b->ls_event ? -1 : 1);

        if (a->ls_lock != b->ls_lock)
                return (a->ls_lock < b->ls_lock ? -1 : 1);

        return (0);
}

/*
 * Comparator that ignores the call site: ascending by event, then by lock
 * address, then descending by event count.  Returns a negative, zero or
 * positive value.
 *
 * The uint32_t counts are compared explicitly; subtracting them and
 * returning the result as an int yields the wrong sign when the counts
 * differ by 2^31 or more.
 */
static int
lock_and_count_cmp_anywhere(lsrec_t *a, lsrec_t *b)
{
        if (a->ls_event < b->ls_event)
                return (-1);
        if (a->ls_event > b->ls_event)
                return (1);

        if (a->ls_lock < b->ls_lock)
                return (-1);
        if (a->ls_lock > b->ls_lock)
                return (1);

        if (a->ls_count > b->ls_count)
                return (-1);
        if (a->ls_count < b->ls_count)
                return (1);

        return (0);
}

/*
 * Comparator that ignores the lock: orders records by event, then by the
 * first g_stkdepth stack entries (highest index first), then by caller.
 * Returns -1, 0 or 1.
 */
static int
sitecmp_anylock(lsrec_t *a, lsrec_t *b)
{
        int depth;

        if (a->ls_event != b->ls_event)
                return (a->ls_event < b->ls_event ? -1 : 1);

        for (depth = g_stkdepth; depth-- > 0; ) {
                uintptr_t pca = a->ls_stack[depth];
                uintptr_t pcb = b->ls_stack[depth];

                if (pca != pcb)
                        return (pca < pcb ? -1 : 1);
        }

        if (a->ls_caller != b->ls_caller)
                return (a->ls_caller < b->ls_caller ? -1 : 1);

        return (0);
}

/*
 * Comparator that ignores the lock: ascending by event, then by the first
 * g_stkdepth stack entries (highest index first), then by caller, then
 * descending by event count.  Returns a negative, zero or positive value.
 *
 * The uint32_t counts are compared explicitly; subtracting them and
 * returning the result as an int yields the wrong sign when the counts
 * differ by 2^31 or more.
 */
static int
site_and_count_cmp_anylock(lsrec_t *a, lsrec_t *b)
{
        int i;

        if (a->ls_event < b->ls_event)
                return (-1);
        if (a->ls_event > b->ls_event)
                return (1);

        for (i = g_stkdepth - 1; i >= 0; i--) {
                if (a->ls_stack[i] < b->ls_stack[i])
                        return (-1);
                if (a->ls_stack[i] > b->ls_stack[i])
                        return (1);
        }

        if (a->ls_caller < b->ls_caller)
                return (-1);
        if (a->ls_caller > b->ls_caller)
                return (1);

        if (a->ls_count > b->ls_count)
                return (-1);
        if (a->ls_count < b->ls_count)
                return (1);

        return (0);
}

/*
 * Sort the n record pointers in a[] into the order defined by cmp, using
 * b[] (at least n entries) as scratch space.
 *
 * cmp  comparator; only the sign test "cmp(x, y) < 0" is used
 * a    array to sort; holds the sorted result on return
 * b    scratch array, same length as a
 * n    number of entries; expected to be at least 1 (with n == 0 the final
 *      store below still reads b[0] and writes a[0])
 *
 * Each half is sorted recursively, then the halves are merged without bounds
 * checks: the first half is copied to b[] in order and the second half in
 * reverse, so the two runs meet in the middle and the merge simply consumes
 * from whichever end is smaller until the indices meet.  When the two
 * candidates compare equal the entry from the second half is taken first, so
 * the sort is not stable.
 */
static void
mergesort(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **a, lsrec_t **b, int n)
{
        int m = n / 2;
        int i, j;

        /* Sort each half in place; a half of length 0 or 1 needs no work. */
        if (m > 1)
                mergesort(cmp, a, b, m);
        if (n - m > 1)
                mergesort(cmp, a + m, b + m, n - m);
        /* Copy the first half to b[0 .. m-1] as is; leaves i == 0. */
        for (i = m; i > 0; i--)
                b[i - 1] = a[i - 1];
        /* Copy the second half to b[m .. n-1] reversed; leaves j == n - 1. */
        for (j = m - 1; j < n - 1; j++)
                b[n + m - j - 2] = a[j + 1];
        /* Merge inward from both ends of b[] back into a[]. */
        while (i < j)
                *a++ = cmp(b[i], b[j]) < 0 ? b[i++] : b[j--];
        /* i == j here: exactly one entry is left. */
        *a = b[i];
}

/*
 * Fold runs of equal records in a sorted array into the first record of
 * each run.
 *
 * cmp   comparator defining "equal" (a return of 0)
 * lock  array of n record pointers, expected to be sorted so that equal
 *       records are adjacent
 * n     number of entries
 *
 * Every record that compares equal to the current run leader has its count
 * and reference count added to the leader, plus its time and histogram when
 * the record size (g_recsize) includes those fields.  The absorbed record is
 * then marked by setting its ls_event to LS_MAX_EVENTS, which is outside the
 * range of valid event numbers.
 */
static void
coalesce(int (*cmp)(lsrec_t *, lsrec_t *), lsrec_t **lock, int n)
{
        lsrec_t *leader = lock[0];
        int idx;

        for (idx = 1; idx < n; idx++) {
                lsrec_t *rec = lock[idx];
                int bucket;

                if (cmp(rec, leader) != 0) {
                        /* A different key starts a new run. */
                        leader = rec;
                        continue;
                }

                rec->ls_event = LS_MAX_EVENTS;
                leader->ls_count += rec->ls_count;
                leader->ls_refcnt += rec->ls_refcnt;

                /* Fields past g_recsize are not part of the record. */
                if (g_recsize >= LS_TIME)
                        leader->ls_time += rec->ls_time;

                if (g_recsize >= LS_HIST) {
                        for (bucket = 0; bucket < 64; bucket++)
                                leader->ls_hist[bucket] += rec->ls_hist[bucket];
                }
        }
}

/*
 * Round *addrp down to the start of the symbol containing it.  The address
 * is left unchanged when addr_to_sym() finds no symbol or when the offset it
 * reports is not smaller than the symbol's size.
 */
static void
coalesce_symbol(uintptr_t *addrp)
{
        uintptr_t off;
        size_t size;

        if (addr_to_sym(*addrp, &off, &size) == NULL)
                return;

        if (off >= size)
                return;

        *addrp -= off;
}

/*
 * AND a new term onto the D predicate string *pred.
 *
 * pred   address of a malloc()ed predicate string, or of a NULL pointer for
 *        "no predicate yet"; replaced by a newly allocated string
 * what   the term (or left-hand operand) to add; NULL means "nothing to
 *        add" and leaves *pred untouched
 * cmp    comparison operator, or NULL if <what> is a complete term
 * value  right-hand operand, emitted in hex; ignored when cmp is NULL
 *
 * With an existing non-empty predicate the result is
 * "(<old>) && (<what> <cmp> 0x<value>)" or "(<old>) && (<what>)"; otherwise
 * it is just "<what> <cmp> 0x<value>" or "<what>".  The caller owns the
 * result and releases it with predicate_destroy().  Exits with status 2 if
 * memory cannot be allocated.
 */
static void
predicate_add(char **pred, char *what, char *cmp, uintptr_t value)
{
        const char *old;
        char *new;
        size_t size;

        if (what == NULL)
                return;

        old = (*pred != NULL) ? *pred : "";

        /*
         * Upper bound for the longest form: both strings, the operator, the
         * fixed punctuation, sixteen hex digits for a 64-bit value and the
         * terminating NUL (counted by sizeof).
         */
        size = strlen(old) + strlen(what) +
            (cmp != NULL ? strlen(cmp) : 0) +
            sizeof ("() && (  0x)") + 16;

        if ((new = malloc(size)) == NULL) {
                (void) fprintf(stderr,
                    "lockstat: failed to allocate predicate: %s\n",
                    strerror(errno));
                exit(2);
        }

        /*
         * The value is printed with an explicit "0x%llx" rather than "0x%p":
         * the output of %p is implementation-defined and may already carry
         * its own "0x" prefix.
         */
        if (old[0] != '\0') {
                if (cmp != NULL) {
                        (void) snprintf(new, size, "(%s) && (%s %s 0x%llx)",
                            old, what, cmp, (unsigned long long)value);
                } else {
                        (void) snprintf(new, size, "(%s) && (%s)", old, what);
                }
        } else {
                if (cmp != NULL) {
                        (void) snprintf(new, size, "%s %s 0x%llx",
                            what, cmp, (unsigned long long)value);
                } else {
                        (void) snprintf(new, size, "%s", what);
                }
        }

        free(*pred);
        *pred = new;
}

/*
 * Release a predicate string built by predicate_add() and reset the caller's
 * pointer to NULL.  Safe to call when *pred is already NULL.
 */
static void
predicate_destroy(char **pred)
{
        char *doomed = *pred;

        *pred = NULL;
        free(doomed);
}

/*
 * OR an address-range test onto the D filter expression *filt.
 *
 * filt  address of a malloc()ed filter string, or of a NULL pointer for "no
 *       filter yet"; replaced by a newly allocated string
 * what  D expression to test (must not be NULL)
 * base  inclusive lower bound of the range
 * size  length of the range; the exclusive upper bound is base + size
 *
 * The term "(<what> >= 0x<base> && <what> < 0x<base+size>)" is appended,
 * separated from any existing text by " || ".  The result is sized from its
 * inputs, so <what> may be of any length.  The caller owns the result and
 * releases it with filter_destroy().  Exits with status 2 if memory cannot
 * be allocated.
 */
static void
filter_add(char **filt, char *what, uintptr_t base, uintptr_t size)
{
        const char *old = (*filt != NULL) ? *filt : "";
        const char *sep = (old[0] != '\0') ? " || " : "";
        size_t len;
        char *new;

        /*
         * Existing text, separator, <what> twice, the fixed punctuation,
         * sixteen hex digits for each 64-bit bound and the terminating NUL
         * (counted by sizeof).
         */
        len = strlen(old) + strlen(sep) + 2 * strlen(what) +
            sizeof ("( >= 0x &&  < 0x)") + 2 * 16;

        if ((new = malloc(len)) == NULL) {
                (void) fprintf(stderr,
                    "lockstat: failed to allocate filter: %s\n",
                    strerror(errno));
                exit(2);
        }

        /*
         * The bounds are printed with an explicit "0x%llx" rather than
         * "0x%p": the output of %p is implementation-defined and may already
         * carry its own "0x" prefix.
         */
        (void) snprintf(new, len, "%s%s(%s >= 0x%llx && %s < 0x%llx)",
            old, sep, what, (unsigned long long)base,
            what, (unsigned long long)(base + size));

        free(*filt);
        *filt = new;
}

/*
 * Release a filter string built by filter_add() and reset the caller's
 * pointer to NULL.  Safe to call when *filt is already NULL.
 */
static void
filter_destroy(char **filt)
{
        char *doomed = *filt;

        *filt = NULL;
        free(doomed);
}

/*
 * Append printf-style formatted text to the generated D program.
 *
 * fmt  printf-style format, followed by its arguments
 *
 * The program lives in g_prog, a NUL-terminated heap buffer of g_proglen
 * bytes (terminator included) that is grown on every call.  Exits via fail()
 * if the text cannot be formatted or the buffer cannot be grown.
 */
static void
dprog_add(const char *fmt, ...)
{
        va_list args;
        int size, offs;
        char c;

        /*
         * First pass: measure.  The argument list is consumed by
         * vsnprintf(), so it must be ended here and restarted before the
         * second pass; reusing it as is would be undefined behavior.
         */
        va_start(args, fmt);
        size = vsnprintf(&c, 1, fmt, args) + 1;
        va_end(args);

        if (size < 1)
                fail(0, "failed to format program text");

        /* New text overwrites the previous terminating NUL, if any. */
        if (g_proglen == 0) {
                offs = 0;
        } else {
                offs = g_proglen - 1;
        }

        g_proglen = offs + size;

        if ((g_prog = realloc(g_prog, g_proglen)) == NULL)
                fail(1, "failed to reallocate program text");

        /* Second pass: format in place at the end of the program. */
        va_start(args, fmt);
        (void) vsnprintf(&g_prog[offs], size, fmt, args);
        va_end(args);
}

/*
 * This function may read like an open sewer, but keep in mind that programs
 * that generate other programs are rarely pretty.  If one has the unenviable
 * task of maintaining or -- worse -- extending this code, use the -V option
 * to examine the D program as generated by this function.
 *
 * Appends to the D program (via dprog_add()) the clause or clauses for one
 * event:
 *
 *   - nothing at all if the event's table entry has no probe name;
 *   - for entries with an ev_acquire probe, a clause on that probe that
 *     records per-thread, per-lock state, and for the recursive-entry error
 *     event a further clause that decrements that state;
 *   - the main clause on ev_name, guarded by the combined predicate, that
 *     either trace()s the event (g_tracing) or aggregates it into @avg and,
 *     when histograms are recorded, @hist.
 *
 * event  index into g_event_info[]
 */
static void
dprog_addevent(int event)
{
        ls_event_info_t *info = &g_event_info[event];
        char *pred = NULL;
        char stack[20];
        const char *arg0, *caller;
        char *arg1 = "arg1";
        char buf[80];
        hrtime_t dur;
        int depth;

        if (info->ev_name[0] == '\0')
                return;

        if (info->ev_type == 'I') {
                /*
                 * For interrupt events, arg0 (normally the lock pointer) is
                 * the CPU address plus the current pil, and arg1 (normally
                 * the number of nanoseconds) is the number of nanoseconds
                 * late -- and it's stored in arg2.
                 */
                arg0 = "(uintptr_t)curthread->t_cpu + \n"
                    "\t    curthread->t_cpu->cpu_profile_pil";
                caller = "(uintptr_t)arg0";
                arg1 = "arg2";
        } else {
                arg0 = "(uintptr_t)arg0";
                caller = "caller";
        }

        if (g_recsize > LS_HIST) {
                /* Smallest depth whose record size covers g_recsize. */
                for (depth = 0; g_recsize > LS_STACK(depth); depth++)
                        continue;

                if (g_tracing) {
                        (void) snprintf(stack, sizeof (stack),
                            "\tstack(%d);\n", depth);
                } else {
                        (void) snprintf(stack, sizeof (stack),
                            ", stack(%d)", depth);
                }
        } else {
                stack[0] = '\0';
        }

        if (info->ev_acquire != NULL) {
                /*
                 * If this is a hold event, we need to generate an additional
                 * clause for the acquire; the clause for the release will be
                 * generated with the aggregating statement, below.
                 */
                dprog_add("%s\n", info->ev_acquire);
                predicate_add(&pred, info->ev_predicate, NULL, 0);
                predicate_add(&pred, g_predicate, NULL, 0);
                if (pred != NULL)
                        dprog_add("/%s/\n", pred);

                dprog_add("{\n");
                (void) snprintf(buf, sizeof (buf),
                    "self->ev%d[(uintptr_t)arg0]", event);

                if (info->ev_type == 'H') {
                        dprog_add("\t%s = timestamp;\n", buf);
                } else {
                        /*
                         * If this isn't a hold event, it's the recursive
                         * error event.  For this, we simply bump the
                         * thread-local, per-lock count.
                         */
                        dprog_add("\t%s++;\n", buf);
                }

                dprog_add("}\n\n");
                /* predicate_destroy() also resets pred to NULL. */
                predicate_destroy(&pred);

                if (info->ev_type == 'E') {
                        /*
                         * If this is the recursive lock error event, we need
                         * to generate an additional clause to decrement the
                         * thread-local, per-lock count.  This assures that we
                         * only execute the aggregating clause if we have
                         * recursive entry.
                         */
                        dprog_add("%s\n", info->ev_name);
                        dprog_add("/%s/\n{\n\t%s--;\n}\n\n", buf, buf);
                }

                /*
                 * The main clause only fires when the thread-local state is
                 * set.  predicate_add() copies buf, so buf may be reused for
                 * the duration expression below.
                 */
                predicate_add(&pred, buf, NULL, 0);

                if (info->ev_type == 'H') {
                        (void) snprintf(buf, sizeof (buf), "timestamp -\n\t    "
                            "self->ev%d[(uintptr_t)arg0]", event);
                }

                arg1 = buf;
        } else {
                predicate_add(&pred, info->ev_predicate, NULL, 0);
                if (info->ev_type != 'I')
                        predicate_add(&pred, g_predicate, NULL, 0);
                else
                        predicate_add(&pred, g_ipredicate, NULL, 0);
        }

        if ((dur = g_min_duration[event]) != 0)
                predicate_add(&pred, arg1, ">=", dur);

        dprog_add("%s\n", info->ev_name);

        if (pred != NULL)
                dprog_add("/%s/\n", pred);
        predicate_destroy(&pred);

        dprog_add("{\n");

        if (g_tracing) {
                dprog_add("\ttrace(%dULL);\n", event);
                dprog_add("\ttrace(%s);\n", arg0);
                dprog_add("\ttrace(%s);\n", caller);
                /* Never pass generated text as the format string itself. */
                dprog_add("%s", stack);
        } else {
                /*
                 * The ordering here is important:  when we process the
                 * aggregate, we count on the fact that @avg appears before
                 * @hist in program order to assure that @avg is assigned the
                 * first aggregation variable ID and @hist assigned the
                 * second; see the comment in process_aggregate() for details.
                 */
                dprog_add("\t@avg[%dULL, %s, %s%s] = avg(%s);\n",
                    event, arg0, caller, stack, arg1);

                if (g_recsize >= LS_HIST) {
                        dprog_add("\t@hist[%dULL, %s, %s%s] = quantize"
                            "(%s);\n", event, arg0, caller, stack, arg1);
                }
        }

        /* Clear the thread-local state set by the acquire clause. */
        if (info->ev_acquire != NULL)
                dprog_add("\tself->ev%d[arg0] = 0;\n", event);

        dprog_add("}\n\n");
}

/*
 * Compile the generated D program (g_prog), enable its probes and start
 * tracing on g_dtp.  When g_Vflag is set the program text is first echoed to
 * stderr between marker lines.  Any libdtrace failure is fatal (dfail()).
 */
static void
dprog_compile(void)
{
        dtrace_prog_t *prog;
        dtrace_proginfo_t info;

        if (g_Vflag) {
                (void) fprintf(stderr, "lockstat: vvvv D program vvvv\n");
                (void) fputs(g_prog, stderr);
                (void) fprintf(stderr, "lockstat: ^^^^ D program ^^^^\n");
        }

        if ((prog = dtrace_program_strcompile(g_dtp, g_prog,
            DTRACE_PROBESPEC_NAME, 0, 0, NULL)) == NULL)
                dfail("failed to compile program");

        if (dtrace_program_exec(g_dtp, prog, &info) == -1)
                dfail("failed to enable probes");

        if (dtrace_go(g_dtp) != 0)
                dfail("couldn't start tracing");
}

/*
 * SIGUSR1 handler installed by status_init().  It intentionally does
 * nothing; presumably the signal's only purpose is to interrupt whatever the
 * main thread is blocked in so that status_check() can run -- confirm
 * against the main loop.
 */
static void
status_fire(int signal __unused)
{
}

/*
 * Arrange for SIGUSR1 to be delivered periodically: install status_fire() as
 * its handler and arm a repeating CLOCK_REALTIME timer whose period is the
 * smaller of the DTrace "statusrate" and "aggrate" option values (treated
 * here as nanoseconds).  Any failure is fatal.
 */
static void
status_init(void)
{
        dtrace_optval_t val, status, agg;
        struct sigaction act;
        struct itimerspec ts;
        struct sigevent ev;
        timer_t tid;

        if (dtrace_getopt(g_dtp, "statusrate", &status) == -1)
                dfail("failed to get 'statusrate'");

        if (dtrace_getopt(g_dtp, "aggrate", &agg) == -1)
                dfail("failed to get 'aggrate'");

        /*
         * We would want to awaken at a rate that is the GCD of the statusrate
         * and the aggrate -- but that seems a bit absurd.  Instead, we'll
         * simply awaken at a rate that is the more frequent of the two, which
         * assures that we're never later than the interval implied by the
         * more frequent rate.
         */
        val = status < agg ? status : agg;

        (void) sigemptyset(&act.sa_mask);
        act.sa_flags = 0;
        act.sa_handler = status_fire;
        (void) sigaction(SIGUSR1, &act, NULL);

        ev.sigev_notify = SIGEV_SIGNAL;
        ev.sigev_signo = SIGUSR1;

        /*
         * NOTE(review): dfail() appends the libdtrace error text, which is
         * unrelated to a timer_create()/timer_settime() failure; reporting
         * errno (as fail(1, ...) does) would be more accurate here.
         */
        if (timer_create(CLOCK_REALTIME, &ev, &tid) == -1)
                dfail("cannot create CLOCK_REALTIME timer");

        /* Same value for the first expiry and for the repeat interval. */
        ts.it_value.tv_sec = val / NANOSEC;
        ts.it_value.tv_nsec = val % NANOSEC;
        ts.it_interval = ts.it_value;

        if (timer_settime(tid, TIMER_RELTIME, &ts, NULL) == -1)
                dfail("cannot set time on CLOCK_REALTIME timer");
}

/*
 * Periodic housekeeping: unless events are being traced, snapshot the
 * aggregation data; then call dtrace_status().  A failure of either call is
 * fatal.
 */
static void
status_check(void)
{
        if (!g_tracing) {
                if (dtrace_aggregate_snap(g_dtp) != 0)
                        dfail("failed to snap aggregate");
        }

        if (dtrace_status(g_dtp) == -1)
                dfail("dtrace_status()");
}

/*
 * Initialize one lockstat record from a DTrace record.
 *
 * lsrec  record to fill; its first g_recsize bytes are zeroed first
 * rec    descriptors for, in order: the event number (uint64_t), the lock
 *        address, the caller address and -- when stacks are recorded
 *        (g_recsize > LS_HIST) -- the stack
 * nrecs  number of descriptors available at rec
 * data   base address that the descriptors' offsets are relative to
 *
 * The count is set to 1.  A descriptor of unexpected size, or too few
 * descriptors, is fatal.  The first entry of the stack is skipped: entry i
 * of the DTrace stack lands in ls_stack[i - 1].
 */
static void
lsrec_fill(lsrec_t *lsrec, const dtrace_recdesc_t *rec, int nrecs, caddr_t data)
{
        int needed = (g_recsize > LS_HIST) ? 4 : 3;

        bzero(lsrec, g_recsize);
        lsrec->ls_count = 1;

        if (nrecs < needed)
                fail(0, "truncated DTrace record");

        /* First descriptor: the event number. */
        if (rec[0].dtrd_size != sizeof (uint64_t))
                fail(0, "bad event size in first record");

        /* LINTED - alignment */
        lsrec->ls_event = (uint32_t)*((uint64_t *)(data + rec[0].dtrd_offset));

        /* Second descriptor: the lock address. */
        if (rec[1].dtrd_size != sizeof (uintptr_t))
                fail(0, "bad lock address size in second record");

        /* LINTED - alignment */
        lsrec->ls_lock = *((uintptr_t *)(data + rec[1].dtrd_offset));

        /* Third descriptor: the caller. */
        if (rec[2].dtrd_size != sizeof (uintptr_t))
                fail(0, "bad caller size in third record");

        /* LINTED - alignment */
        lsrec->ls_caller = *((uintptr_t *)(data + rec[2].dtrd_offset));

        /* Fourth descriptor, only present when stacks are recorded. */
        if (g_recsize > LS_HIST) {
                int nframes, f;
                pc_t *pcs;

                nframes = rec[3].dtrd_size / sizeof (pc_t);
                /* LINTED - alignment */
                pcs = (pc_t *)(data + rec[3].dtrd_offset);

                for (f = 0; f + 1 < nframes; f++)
                        lsrec->ls_stack[f] = pcs[f + 1];
        }
}

/*
 * Aggregation-walk callback that counts entries: increments the size_t that
 * arg points to and asks the walk to continue.  The aggregation data itself
 * is not examined.
 */
/*ARGSUSED*/
static int
count_aggregate(const dtrace_aggdata_t *agg, void *arg)
{
        size_t *countp = arg;

        (*countp)++;

        return (DTRACE_AGGWALK_NEXT);
}

/*
 * Aggregation-walk callback that converts aggregation entries into lsrec_t
 * records in the caller-supplied buffer.
 *
 *   agg - the aggregation entry being visited
 *   arg - lsdata_t cursor: lsd_next is the record being built, lsd_count the
 *         number of completed records
 *
 * An "avg" entry supplies the key fields (via lsrec_fill()) plus the count
 * and cumulative time; a "hist" entry supplies the histogram buckets for the
 * record currently under the cursor.  When histograms are being collected
 * (g_recsize >= LS_HIST) the cursor is advanced only by the "hist" entry, so
 * this relies on the walk delivering the "avg" entry for a key before its
 * "hist" entry (process_data() uses a key/variable-sorted walk).
 *
 * Always returns DTRACE_AGGWALK_NEXT; malformed entries end in fail().
 */
static int
process_aggregate(const dtrace_aggdata_t *agg, void *arg)
{
        const dtrace_aggdesc_t *aggdesc = agg->dtada_desc;
        caddr_t data = agg->dtada_data;
        lsdata_t *lsdata = arg;
        lsrec_t *lsrec = lsdata->lsd_next;
        const dtrace_recdesc_t *rec;
        uint64_t *avg, *quantized;
        int i, j;

        assert(lsdata->lsd_count < g_nrecs);

        /*
         * Aggregation variable IDs are guaranteed to be generated in program
         * order, and they are guaranteed to start from DTRACE_AGGVARIDNONE
         * plus one.  As "avg" appears before "hist" in program order, we know
         * that "avg" will be allocated the first aggregation variable ID, and
         * "hist" will be allocated the second aggregation variable ID -- and
         * we therefore use the aggregation variable ID to differentiate the
         * cases.
         */
        if (aggdesc->dtagd_varid > DTRACE_AGGVARIDNONE + 1) {
                /*
                 * This is the histogram entry.  We'll copy the quantized
                 * data into ls_hist, and jump over the rest.
                 */
                rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];

                if (aggdesc->dtagd_varid != DTRACE_AGGVARIDNONE + 2)
                        fail(0, "bad variable ID in aggregation record");

                if (rec->dtrd_size !=
                    DTRACE_QUANTIZE_NBUCKETS * sizeof (uint64_t))
                        fail(0, "bad quantize size in aggregation record");

                /* LINTED - alignment */
                quantized = (uint64_t *)(data + rec->dtrd_offset);

                /*
                 * Only the buckets from the zero bucket upward are kept;
                 * the buckets below it are not copied.
                 */
                for (i = DTRACE_QUANTIZE_ZEROBUCKET, j = 0;
                    i < DTRACE_QUANTIZE_NBUCKETS; i++, j++)
                        lsrec->ls_hist[j] = quantized[i];

                goto out;
        }

        /*
         * "avg" entry: the descriptors after the first are the key, and the
         * last descriptor is the aggregated value.
         */
        lsrec_fill(lsrec, &aggdesc->dtagd_rec[1],
            aggdesc->dtagd_nrecs - 1, data);

        rec = &aggdesc->dtagd_rec[aggdesc->dtagd_nrecs - 1];

        if (rec->dtrd_size != 2 * sizeof (uint64_t))
                fail(0, "bad avg size in aggregation record");

        /* LINTED - alignment */
        avg = (uint64_t *)(data + rec->dtrd_offset);
        lsrec->ls_count = (uint32_t)avg[0];
        /*
         * ls_time is a uint64_t, so store the full 64-bit value; casting it
         * through uintptr_t would drop the upper half in a 32-bit build.
         */
        lsrec->ls_time = avg[1];

        /*
         * With histograms enabled the record is not complete yet: leave the
         * cursor in place so the "hist" entry fills in ls_hist and advances.
         */
        if (g_recsize >= LS_HIST)
                return (DTRACE_AGGWALK_NEXT);

out:
        lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
        lsdata->lsd_count++;

        return (DTRACE_AGGWALK_NEXT);
}

static int
process_trace(const dtrace_probedata_t *pdata, void *arg)
{
        lsdata_t *lsdata = arg;
        lsrec_t *lsrec = lsdata->lsd_next;
        dtrace_eprobedesc_t *edesc = pdata->dtpda_edesc;
        caddr_t data = pdata->dtpda_data;

        if (lsdata->lsd_count >= g_nrecs)
                return (DTRACE_CONSUME_NEXT);

        lsrec_fill(lsrec, edesc->dtepd_rec, edesc->dtepd_nrecs, data);

        lsdata->lsd_next = (lsrec_t *)((uintptr_t)lsrec + g_recsize);
        lsdata->lsd_count++;

        return (DTRACE_CONSUME_NEXT);
}

/*
 * Read the collected DTrace data into the record buffer at data and return
 * the number of records stored.
 *
 * In tracing mode the principal buffer is consumed through process_trace()
 * (out is handed to dtrace_consume()); otherwise the aggregation is walked
 * in key/variable-sorted order through process_aggregate().  A failure of
 * either libdtrace call is reported via dfail().
 */
static int
process_data(FILE *out, char *data)
{
        lsdata_t cursor;

        cursor.lsd_count = 0;
        /* LINTED - alignment */
        cursor.lsd_next = (lsrec_t *)data;

        if (!g_tracing) {
                if (dtrace_aggregate_walk_keyvarsorted(g_dtp,
                    process_aggregate, &cursor) != 0)
                        dfail("failed to walk aggregate");
        } else {
                if (dtrace_consume(g_dtp, out,
                    process_trace, NULL, &cursor) != 0)
                        dfail("failed to consume buffer");
        }

        return (cursor.lsd_count);
}

/*
 * Drop handler registered with dtrace_handle_drop(): relays libdtrace's drop
 * message to stderr as a lockstat warning and bumps g_dropped so main() can
 * tell afterwards that data was lost.  Always returns DTRACE_HANDLE_OK.
 */
/*ARGSUSED*/
static int
drophandler(const dtrace_dropdata_t *data, void *arg)
{
        (void) fprintf(stderr, "lockstat: warning: %s", data->dtdda_msg);
        g_dropped++;

        return (DTRACE_HANDLE_OK);
}

/*
 * lockstat entry point.
 *
 * Parses the command line, hands every enabled event to dprog_addevent() and
 * calls dprog_compile(), runs the command named by the remaining arguments
 * as a child process, then reads the collected records back, post-processes
 * them (-g expansion, -c/-k/-w/-W coalescing, sorting) and prints one report
 * per event type, or a single trace listing with -T.
 *
 * Returns 0 on success.  Unrecoverable conditions are reported through
 * fail(), dfail() or usage().
 */
int
main(int argc, char **argv)
{
        char *data_buf;
        lsrec_t *lsp, **current, **first, **sort_buf, **merge_buf;
        FILE *out = stdout;
        int c;
        pid_t child;
        int status;
        int i, j;
        hrtime_t duration;
        char *addrp, *offp, *sizep, *evp, *lastp, *p;
        uintptr_t addr;
        size_t size, off;
        int events_specified = 0;
        int exec_errno = 0;
        uint32_t event;
        char *filt = NULL, *ifilt = NULL;
        static uint64_t ev_count[LS_MAX_EVENTS + 1];
        static uint64_t ev_time[LS_MAX_EVENTS + 1];
        dtrace_optval_t aggsize;
        char aggstr[10];
        long ncpus;
        int dynvar = 0;
        int err;

        if ((g_dtp = dtrace_open(DTRACE_VERSION, 0, &err)) == NULL) {
                fail(0, "cannot open dtrace library: %s",
                    dtrace_errmsg(NULL, err));
        }

        if (dtrace_handle_drop(g_dtp, &drophandler, NULL) == -1)
                dfail("couldn't establish drop handler");

        if (symtab_init() == -1)
                fail(1, "can't load kernel symbols");

        g_nrecs = DEFAULT_NRECS;

        /*
         * First pass over the options: everything except -x, which is
         * applied in a second pass once our own defaults have been set.
         */
        while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
                switch (c) {
                case 'b':
                        g_recsize = LS_BASIC;
                        break;

                case 't':
                        g_recsize = LS_TIME;
                        break;

                case 'h':
                        g_recsize = LS_HIST;
                        break;

                case 's':
                        if (!isdigit((unsigned char)optarg[0]))
                                usage();
                        g_stkdepth = atoi(optarg);
                        if (g_stkdepth > LS_MAX_STACK_DEPTH)
                                fail(0, "max stack depth is %d",
                                    LS_MAX_STACK_DEPTH);
                        g_recsize = LS_STACK(g_stkdepth);
                        break;

                case 'n':
                        if (!isdigit((unsigned char)optarg[0]))
                                usage();
                        g_nrecs = atoi(optarg);
                        break;

                case 'd':
                        if (!isdigit((unsigned char)optarg[0]))
                                usage();
                        duration = atoll(optarg);

                        /*
                         * XXX -- durations really should be per event
                         * since the units are different, but it's hard
                         * to express this nicely in the interface.
                         * Not clear yet what the cleanest solution is.
                         */
                        for (i = 0; i < LS_MAX_EVENTS; i++)
                                if (g_event_info[i].ev_type != 'E')
                                        g_min_duration[i] = duration;

                        break;

                case 'i':
                        if (!isdigit((unsigned char)optarg[0]))
                                usage();
                        i = atoi(optarg);
                        if (i <= 0)
                                usage();
                        if (i > MAX_HZ)
                                fail(0, "max interrupt rate is %d Hz", MAX_HZ);

                        /*
                         * Locate the profiling-interrupt event and rewrite
                         * its probe name to use the requested rate.
                         */
                        for (j = 0; j < LS_MAX_EVENTS; j++)
                                if (strcmp(g_event_info[j].ev_desc,
                                    "Profiling interrupt") == 0)
                                        break;

                        (void) sprintf(g_event_info[j].ev_name,
                            "profile:::profile-%d", i);
                        break;

                case 'l':
                case 'f':
                        /*
                         * The argument is addr[+off][,size].  The first
                         * strtok() pair splits off the size; because that
                         * already terminated the address part, the second
                         * pair can re-scan it to split off the offset.
                         */
                        addrp = strtok(optarg, ",");
                        sizep = strtok(NULL, ",");
                        addrp = strtok(optarg, ",+");
                        offp = strtok(NULL, ",");

                        /* An argument with no address token is unusable. */
                        if (addrp == NULL)
                                usage();

                        size = sizep ? strtoul(sizep, NULL, 0) : 1;
                        off = offp ? strtoul(offp, NULL, 0) : 0;

                        if (addrp[0] == '0') {
                                addr = strtoul(addrp, NULL, 16) + off;
                        } else {
                                addr = sym_to_addr(addrp) + off;
                                if (sizep == NULL)
                                        size = sym_size(addrp) - off;
                                if (addr - off == 0)
                                        fail(0, "symbol '%s' not found", addrp);
                                if (size == 0)
                                        size = 1;
                        }


                        if (c == 'l') {
                                filter_add(&filt, "arg0", addr, size);
                        } else {
                                filter_add(&filt, "caller", addr, size);
                                filter_add(&ifilt, "arg0", addr, size);
                        }
                        break;

                case 'e':
                        /*
                         * Comma-separated list of event numbers or
                         * first-last ranges.  The outer split uses
                         * strtok_r() so the inner strtok() on '-' does
                         * not disturb its state.
                         */
                        evp = strtok_r(optarg, ",", &lastp);
                        while (evp) {
                                int ev1, ev2;
                                char *evp2;

                                (void) strtok(evp, "-");
                                evp2 = strtok(NULL, "-");
                                ev1 = atoi(evp);
                                ev2 = evp2 ? atoi(evp2) : ev1;
                                if ((uint_t)ev1 >= LS_MAX_EVENTS ||
                                    (uint_t)ev2 >= LS_MAX_EVENTS || ev1 > ev2)
                                        fail(0, "-e events out of range");
                                for (i = ev1; i <= ev2; i++)
                                        g_enabled[i] = 1;
                                evp = strtok_r(NULL, ",", &lastp);
                        }
                        events_specified = 1;
                        break;

                case 'c':
                        g_cflag = 1;
                        break;

                case 'k':
                        g_kflag = 1;
                        break;

                case 'w':
                        g_wflag = 1;
                        break;

                case 'W':
                        g_Wflag = 1;
                        break;

                case 'g':
                        g_gflag = 1;
                        break;

                case 'C':
                case 'E':
                case 'H':
                case 'I':
                        for (i = 0; i < LS_MAX_EVENTS; i++)
                                if (g_event_info[i].ev_type == c)
                                        g_enabled[i] = 1;
                        events_specified = 1;
                        break;

                case 'A':
                        for (i = 0; i < LS_MAX_EVENTS; i++)
                                if (strchr("CH", g_event_info[i].ev_type))
                                        g_enabled[i] = 1;
                        events_specified = 1;
                        break;

                case 'T':
                        g_tracing = 1;
                        break;

                case 'D':
                        if (!isdigit((unsigned char)optarg[0]))
                                usage();
                        g_topn = atoi(optarg);
                        break;

                case 'R':
                        g_rates = 1;
                        break;

                case 'p':
                        g_pflag = 1;
                        break;

                case 'P':
                        g_Pflag = 1;
                        break;

                case 'o':
                        if ((out = fopen(optarg, "w")) == NULL)
                                fail(1, "error opening file");
                        break;

                case 'V':
                        g_Vflag = 1;
                        break;

                default:
                        /*
                         * Valid options with no case here (-x) are handled
                         * in the second pass; anything else is an error.
                         */
                        if (strchr(LOCKSTAT_OPTSTR, c) == NULL)
                                usage();
                }
        }

        if (filt != NULL) {
                predicate_add(&g_predicate, filt, NULL, 0);
                filter_destroy(&filt);
        }

        if (ifilt != NULL) {
                predicate_add(&g_ipredicate, ifilt, NULL, 0);
                filter_destroy(&ifilt);
        }

        /*
         * No explicit record type: default to timing data, or to full-depth
         * stacks when -g was given.
         */
        if (g_recsize == 0) {
                if (g_gflag) {
                        g_stkdepth = LS_MAX_STACK_DEPTH;
                        g_recsize = LS_STACK(g_stkdepth);
                } else {
                        g_recsize = LS_TIME;
                }
        }

        if (g_gflag && g_recsize <= LS_STACK(0))
                fail(0, "'-g' requires at least '-s 1' data gathering");

        /*
         * Make sure the alignment is reasonable
         * (round the record size up to a multiple of sizeof (uint64_t)).
         */
        g_recsize = -(-g_recsize & -sizeof (uint64_t));

        for (i = 0; i < LS_MAX_EVENTS; i++) {
                /*
                 * If no events were specified, enable -C.
                 */
                if (!events_specified && g_event_info[i].ev_type == 'C')
                        g_enabled[i] = 1;
        }

        for (i = 0; i < LS_MAX_EVENTS; i++) {
                if (!g_enabled[i])
                        continue;

                if (g_event_info[i].ev_acquire != NULL) {
                        /*
                         * If we've enabled a hold event, we must explicitly
                         * allocate dynamic variable space.
                         */
                        dynvar = 1;
                }

                dprog_addevent(i);
        }

        /*
         * Make sure there are remaining arguments to specify a child command
         * to execute.
         */
        if (argc <= optind)
                usage();

        if ((ncpus = sysconf(_SC_NPROCESSORS_ONLN)) == -1)
                dfail("couldn't determine number of online CPUs");

        /*
         * By default, we set our data buffer size to be the number of records
         * multiplied by the size of the record, doubled to account for some
         * DTrace slop and divided by the number of CPUs.  We silently clamp
         * the aggregation size at both a minimum and a maximum to prevent
         * absurdly low or high values.
         */
        if ((aggsize = (g_nrecs * g_recsize * 2) / ncpus) < MIN_AGGSIZE)
                aggsize = MIN_AGGSIZE;

        if (aggsize > MAX_AGGSIZE)
                aggsize = MAX_AGGSIZE;

        (void) sprintf(aggstr, "%lld", (long long)aggsize);

        if (!g_tracing) {
                if (dtrace_setopt(g_dtp, "bufsize", "4k") == -1)
                        dfail("failed to set 'bufsize'");

                if (dtrace_setopt(g_dtp, "aggsize", aggstr) == -1)
                        dfail("failed to set 'aggsize'");

                if (dynvar) {
                        /*
                         * If we're using dynamic variables, we set our
                         * dynamic variable size to be one megabyte per CPU,
                         * with a hard-limit of 32 megabytes.  This may still
                         * be too small in some cases, but it can be tuned
                         * manually via -x if need be.
                         */
                        (void) sprintf(aggstr, "%ldm", ncpus < 32 ? ncpus : 32);

                        if (dtrace_setopt(g_dtp, "dynvarsize", aggstr) == -1)
                                dfail("failed to set 'dynvarsize'");
                }
        } else {
                if (dtrace_setopt(g_dtp, "bufsize", aggstr) == -1)
                        dfail("failed to set 'bufsize'");
        }

        if (dtrace_setopt(g_dtp, "statusrate", "10sec") == -1)
                dfail("failed to set 'statusrate'");

        /*
         * Second pass over the options: apply -x name[=value] settings last
         * so that they override the defaults chosen above.
         */
        optind = 1;
        while ((c = getopt(argc, argv, LOCKSTAT_OPTSTR)) != EOF) {
                switch (c) {
                case 'x':
                        if ((p = strchr(optarg, '=')) != NULL)
                                *p++ = '\0';

                        if (dtrace_setopt(g_dtp, optarg, p) != 0)
                                dfail("failed to set -x %s", optarg);
                        break;
                }
        }

        argc -= optind;
        argv += optind;

        dprog_compile();
        status_init();

        /* Negated start time; adding the end time later yields elapsed. */
        g_elapsed = -gethrtime();

        /*
         * Spawn the specified command and wait for it to complete.
         */
        child = fork();
        if (child == -1)
                fail(1, "cannot fork");
        if (child == 0) {
                (void) dtrace_close(g_dtp);
                (void) execvp(argv[0], &argv[0]);

                /*
                 * Only reached if the exec failed.  After fork() the child
                 * has its own copy of our variables, so the parent can never
                 * observe an errno saved here; report the reason from the
                 * child itself before exiting.
                 */
                exec_errno = errno;
                (void) fprintf(stderr, "lockstat: could not execute %s: %s\n",
                    argv[0], strerror(exec_errno));
                exit(127);
        }

        /*
         * waitpid() may return early (e.g. when interrupted); each time it
         * does without reaping the child, run a status check.
         */
        while (waitpid(child, &status, WEXITED) != child)
                status_check();

        g_elapsed += gethrtime();

        if (WIFEXITED(status)) {
                if (WEXITSTATUS(status) != 0) {
                        (void) fprintf(stderr,
                            "lockstat: warning: %s exited with code %d\n",
                            argv[0], WEXITSTATUS(status));
                }
        } else {
                (void) fprintf(stderr,
                    "lockstat: warning: %s died on signal %d\n",
                    argv[0], WTERMSIG(status));
        }

        if (dtrace_stop(g_dtp) == -1)
                dfail("failed to stop dtrace");

        /*
         * Before we read out the results, we need to allocate our buffer.
         * If we're tracing, then we'll just use the precalculated size.  If
         * we're not, then we'll take a snapshot of the aggregate, and walk
         * it to count the number of records.
         */
        if (!g_tracing) {
                if (dtrace_aggregate_snap(g_dtp) != 0)
                        dfail("failed to snap aggregate");

                g_nrecs = 0;

                if (dtrace_aggregate_walk(g_dtp,
                    count_aggregate, &g_nrecs) != 0)
                        dfail("failed to walk aggregate");
        }

        /* One extra slot is allocated for the sentinel record added below. */
        if ((data_buf = memalign(sizeof (uint64_t),
            (g_nrecs + 1) * g_recsize)) == NULL)
                fail(1, "Memory allocation failed");

        /*
         * Read out the DTrace data.
         */
        g_nrecs_used = process_data(out, data_buf);

        if (g_nrecs_used > g_nrecs || g_dropped)
                (void) fprintf(stderr, "lockstat: warning: "
                    "ran out of data records (use -n for more)\n");

        /* Accumulate per-event totals for the report headers. */
        /* LINTED - alignment */
        for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
            /* LINTED - alignment */
            lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
                ev_count[lsp->ls_event] += lsp->ls_count;
                ev_time[lsp->ls_event] += lsp->ls_time;
        }

        /*
         * If -g was specified, convert stacks into individual records.
         */
        if (g_gflag) {
                lsrec_t *newlsp, *oldlsp;

                /*
                 * Each old record expands into at most g_stkdepth + 1 new
                 * LS_TIME-sized records (one per stack frame, plus one for
                 * the caller if it is not in the stack).  One further slot
                 * is reserved for the sentinel record written after the
                 * last record below; without it a completely full buffer
                 * would be overrun.
                 */
                newlsp = memalign(sizeof (uint64_t),
                    (g_nrecs_used * (g_stkdepth + 1) + 1) * LS_TIME);
                if (newlsp == NULL)
                        fail(1, "Cannot allocate space for -g processing");
                lsp = newlsp;
                /* LINTED - alignment */
                for (i = 0, oldlsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
                    /* LINTED - alignment */
                    oldlsp = (lsrec_t *)((char *)oldlsp + g_recsize)) {
                        int fr;
                        int caller_in_stack = 0;

                        if (oldlsp->ls_count == 0)
                                continue;

                        for (fr = 0; fr < g_stkdepth; fr++) {
                                if (oldlsp->ls_stack[fr] == 0)
                                        break;
                                if (oldlsp->ls_stack[fr] == oldlsp->ls_caller)
                                        caller_in_stack = 1;
                                bcopy(oldlsp, lsp, LS_TIME);
                                lsp->ls_caller = oldlsp->ls_stack[fr];
                                /* LINTED - alignment */
                                lsp = (lsrec_t *)((char *)lsp + LS_TIME);
                        }
                        if (!caller_in_stack) {
                                bcopy(oldlsp, lsp, LS_TIME);
                                /* LINTED - alignment */
                                lsp = (lsrec_t *)((char *)lsp + LS_TIME);
                        }
                }
                g_nrecs = g_nrecs_used =
                    ((uintptr_t)lsp - (uintptr_t)newlsp) / LS_TIME;
                g_recsize = LS_TIME;
                g_stkdepth = 0;
                free(data_buf);
                data_buf = (char *)newlsp;
        }

        /*
         * A single allocation holds both pointer arrays: the sort buffer
         * in the first half and the merge scratch area in the second.
         */
        if ((sort_buf = calloc(2 * (g_nrecs + 1),
            sizeof (void *))) == NULL)
                fail(1, "Sort buffer allocation failed");
        merge_buf = sort_buf + (g_nrecs + 1);

        /*
         * Build the sort buffer, discarding zero-count records along the way.
         * (They are marked with the out-of-range event LS_MAX_EVENTS.)
         */
        /* LINTED - alignment */
        for (i = 0, lsp = (lsrec_t *)data_buf; i < g_nrecs_used; i++,
            /* LINTED - alignment */
            lsp = (lsrec_t *)((char *)lsp + g_recsize)) {
                if (lsp->ls_count == 0)
                        lsp->ls_event = LS_MAX_EVENTS;
                sort_buf[i] = lsp;
        }

        if (g_nrecs_used == 0)
                exit(0);

        /*
         * Add a sentinel after the last record
         */
        sort_buf[i] = lsp;
        lsp->ls_event = LS_MAX_EVENTS;

        if (g_tracing) {
                report_trace(out, sort_buf);
                return (0);
        }

        /*
         * Application of -g may have resulted in multiple records
         * with the same signature; coalesce them.
         */
        if (g_gflag) {
                mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
                coalesce(lockcmp, sort_buf, g_nrecs_used);
        }

        /*
         * Coalesce locks within the same symbol if -c option specified.
         * Coalesce PCs within the same function if -k option specified.
         */
        if (g_cflag || g_kflag) {
                for (i = 0; i < g_nrecs_used; i++) {
                        int fr;
                        lsp = sort_buf[i];
                        if (g_cflag)
                                coalesce_symbol(&lsp->ls_lock);
                        if (g_kflag) {
                                for (fr = 0; fr < g_stkdepth; fr++)
                                        coalesce_symbol(&lsp->ls_stack[fr]);
                                coalesce_symbol(&lsp->ls_caller);
                        }
                }
                mergesort(lockcmp, sort_buf, merge_buf, g_nrecs_used);
                coalesce(lockcmp, sort_buf, g_nrecs_used);
        }

        /*
         * Coalesce callers if -w option specified
         */
        if (g_wflag) {
                mergesort(lock_and_count_cmp_anywhere,
                    sort_buf, merge_buf, g_nrecs_used);
                coalesce(lockcmp_anywhere, sort_buf, g_nrecs_used);
        }

        /*
         * Coalesce locks if -W option specified
         */
        if (g_Wflag) {
                mergesort(site_and_count_cmp_anylock,
                    sort_buf, merge_buf, g_nrecs_used);
                coalesce(sitecmp_anylock, sort_buf, g_nrecs_used);
        }

        /*
         * Sort data by contention count (ls_count) or total time (ls_time),
         * depending on g_Pflag.  Override g_Pflag if time wasn't measured.
         */
        if (g_recsize < LS_TIME)
                g_Pflag = 0;

        if (g_Pflag)
                mergesort(timecmp, sort_buf, merge_buf, g_nrecs_used);
        else
                mergesort(countcmp, sort_buf, merge_buf, g_nrecs_used);

        /*
         * Display data by event type
         * (each run of records sharing an event is reported together; the
         * sentinel's LS_MAX_EVENTS event ends both loops).
         */
        first = &sort_buf[0];
        while ((event = (*first)->ls_event) < LS_MAX_EVENTS) {
                current = first;
                while ((lsp = *current)->ls_event == event)
                        current++;
                report_stats(out, first, current - first, ev_count[event],
                    ev_time[event]);
                first = current;
        }

        return (0);
}

/*
 * Render addr into buf as text and return buf.
 *
 * The address is resolved with addr_to_sym(), which yields a symbol name,
 * the offset of addr within it and the symbol's size.  The form chosen is:
 *
 *   name[size]   offset is zero and show_size is set
 *   name         offset is zero
 *   name+N       offset below 16 and the name begins with "cpu[" (decimal)
 *   name+0xN     offset within the symbol, or below 256 and not the raw
 *                address itself
 *   0xADDR       anything else
 *
 * buf must be large enough for the result; no bound is applied here.
 */
static char *
format_symbol(char *buf, uintptr_t addr, int show_size)
{
        uintptr_t delta;
        size_t extent;
        char *sym;

        sym = addr_to_sym(addr, &delta, &extent);

        if (delta == 0) {
                if (show_size)
                        (void) sprintf(buf, "%s[%ld]", sym, (long)extent);
                else
                        (void) sprintf(buf, "%s", sym);
                return (buf);
        }

        if (delta < 16 && bcmp(sym, "cpu[", 4) == 0) {
                /* CPU+PIL */
                (void) sprintf(buf, "%s+%ld", sym, (long)delta);
                return (buf);
        }

        if (delta <= extent || (delta < 256 && addr != delta)) {
                (void) sprintf(buf, "%s+0x%llx", sym,
                    (unsigned long long)delta);
                return (buf);
        }

        (void) sprintf(buf, "0x%llx", (unsigned long long)addr);
        return (buf);
}

/*
 * Print the report for one event type.
 *
 *   out         - stream to write to
 *   sort_buf    - the records of this event, already sorted by the caller
 *   nrecs       - number of entries in sort_buf
 *   total_count - sum of ls_count over all records of this event
 *   total_time  - sum of ls_time over all records of this event
 *
 * With g_topn == 0 only a one-line total is printed.  Otherwise at most
 * g_topn records are shown: one parsable line each with -p, or a table row
 * each, followed by the time-distribution histogram (with the stack printed
 * alongside it) when the record type includes a histogram.
 */
static void
report_stats(FILE *out, lsrec_t **sort_buf, size_t nrecs, uint64_t total_count,
    uint64_t total_time)
{
        uint32_t event = sort_buf[0]->ls_event;
        lsrec_t *lsp;
        double ptotal = 0.0;
        double percent;
        int i, j, fr;
        int displayed;
        int first_bin, last_bin;
        uint64_t total_bin_count;
        int rectype;
        char buf[256];
        char lhdr[80], chdr[80];
        /* Number of buckets in ls_hist[]. */
        const int nbins = sizeof (lsp->ls_hist) / sizeof (lsp->ls_hist[0]);

        rectype = g_recsize;

        if (g_topn == 0) {
                (void) fprintf(out, "%20llu %s\n",
                    g_rates == 0 ? total_count :
                    ((unsigned long long)total_count * NANOSEC) / g_elapsed,
                    g_event_info[event].ev_desc);
                return;
        }

        (void) sprintf(lhdr, "%s%s",
            g_Wflag ? "Hottest " : "", g_event_info[event].ev_lhdr);
        (void) sprintf(chdr, "%s%s",
            g_wflag ? "Hottest " : "", "Caller");

        if (!g_pflag)
                (void) fprintf(out,
                    "\n%s: %.0f events in %.3f seconds (%.0f events/sec)\n\n",
                    g_event_info[event].ev_desc, (double)total_count,
                    (double)g_elapsed / NANOSEC,
                    (double)total_count * NANOSEC / g_elapsed);

        /*
         * Without histograms a single column header serves the whole table;
         * with histograms the header is repeated before every record below.
         */
        if (!g_pflag && rectype < LS_HIST) {
                (void) sprintf(buf, "%s", g_event_info[event].ev_units);
                (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
                    g_rates ? "ops/s" : "Count",
                    g_gflag ? "genr" : "indv",
                    "cuml", "rcnt", rectype >= LS_TIME ? buf : "", lhdr, chdr);
                (void) fprintf(out, "---------------------------------"
                    "----------------------------------------------\n");
        }

        displayed = 0;
        for (i = 0; i < nrecs; i++) {
                lsp = sort_buf[i];

                if (displayed++ >= g_topn)
                        break;

                if (g_pflag) {
                        /* Parsable output: one space-separated line. */
                        (void) fprintf(out, "%u %u",
                            lsp->ls_event, lsp->ls_count);
                        (void) fprintf(out, " %s",
                            format_symbol(buf, lsp->ls_lock, g_cflag));
                        (void) fprintf(out, " %s",
                            format_symbol(buf, lsp->ls_caller, 0));
                        (void) fprintf(out, " %f",
                            (double)lsp->ls_refcnt / lsp->ls_count);
                        if (rectype >= LS_TIME)
                                (void) fprintf(out, " %llu",
                                    (unsigned long long)lsp->ls_time);
                        if (rectype >= LS_HIST) {
                                for (j = 0; j < nbins; j++)
                                        (void) fprintf(out, " %u",
                                            lsp->ls_hist[j]);
                        }
                        for (j = 0; j < LS_MAX_STACK_DEPTH; j++) {
                                if (rectype <= LS_STACK(j) ||
                                    lsp->ls_stack[j] == 0)
                                        break;
                                (void) fprintf(out, " %s",
                                    format_symbol(buf, lsp->ls_stack[j], 0));
                        }
                        (void) fprintf(out, "\n");
                        continue;
                }

                if (rectype >= LS_HIST) {
                        (void) fprintf(out, "---------------------------------"
                            "----------------------------------------------\n");
                        (void) sprintf(buf, "%s",
                            g_event_info[event].ev_units);
                        (void) fprintf(out, "%5s %4s %4s %4s %8s %-22s %-24s\n",
                            g_rates ? "ops/s" : "Count",
                            g_gflag ? "genr" : "indv",
                            "cuml", "rcnt", buf, lhdr, chdr);
                }

                /* Share of the event total, by time (-P) or by count. */
                if (g_Pflag && total_time != 0)
                        percent = (lsp->ls_time * 100.00) / total_time;
                else
                        percent = (lsp->ls_count * 100.00) / total_count;

                ptotal += percent;

                /* Average duration per event, when time was recorded. */
                if (rectype >= LS_TIME)
                        (void) sprintf(buf, "%llu",
                            (unsigned long long)(lsp->ls_time / lsp->ls_count));
                else
                        buf[0] = '\0';

                (void) fprintf(out, "%5llu ",
                    g_rates == 0 ? lsp->ls_count :
                    ((uint64_t)lsp->ls_count * NANOSEC) / g_elapsed);

                (void) fprintf(out, "%3.0f%% ", percent);

                if (g_gflag)
                        (void) fprintf(out, "---- ");
                else
                        (void) fprintf(out, "%3.0f%% ", ptotal);

                (void) fprintf(out, "%4.2f %8s ",
                    (double)lsp->ls_refcnt / lsp->ls_count, buf);

                (void) fprintf(out, "%-22s ",
                    format_symbol(buf, lsp->ls_lock, g_cflag));

                (void) fprintf(out, "%-24s\n",
                    format_symbol(buf, lsp->ls_caller, 0));

                if (rectype < LS_HIST)
                        continue;

                (void) fprintf(out, "\n");
                (void) fprintf(out, "%10s %31s %-9s %-24s\n",
                    g_event_info[event].ev_units,
                    "------ Time Distribution ------",
                    g_rates ? "ops/s" : "count",
                    rectype > LS_STACK(0) ? "Stack" : "");

                /*
                 * Trim empty buckets from both ends.  Both scans are
                 * bounded so that an all-zero histogram cannot run off the
                 * array; in that case first_bin ends up past last_bin and
                 * no histogram rows are printed.
                 */
                first_bin = 0;
                while (first_bin < nbins && lsp->ls_hist[first_bin] == 0)
                        first_bin++;

                last_bin = nbins - 1;
                while (last_bin > first_bin && lsp->ls_hist[last_bin] == 0)
                        last_bin--;

                total_bin_count = 0;
                for (j = first_bin; j <= last_bin; j++)
                        total_bin_count += lsp->ls_hist[j];

                /*
                 * If we went a few frames below the caller, ignore them
                 */
                for (fr = 3; fr > 0; fr--)
                        if (lsp->ls_stack[fr] == lsp->ls_caller)
                                break;

                for (j = first_bin; j <= last_bin; j++) {
                        /*
                         * Bar length out of 30 columns.  Computed in 64 bits
                         * so a large bucket cannot wrap; total_bin_count is
                         * nonzero whenever this loop runs.
                         */
                        uint_t depth = (uint_t)(((uint64_t)lsp->ls_hist[j] *
                            30) / total_bin_count);
                        (void) fprintf(out, "%10llu |%s%s %-9u ",
                            1ULL << j,
                            "@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@" + 30 - depth,
                            "                              " + depth,
                            g_rates == 0 ? lsp->ls_hist[j] :
                            (uint_t)(((uint64_t)lsp->ls_hist[j] * NANOSEC) /
                            g_elapsed));
                        /* Print one stack frame beside each row, if any. */
                        if (rectype <= LS_STACK(fr) || lsp->ls_stack[fr] == 0) {
                                (void) fprintf(out, "\n");
                                continue;
                        }
                        (void) fprintf(out, "%-24s\n",
                            format_symbol(buf, lsp->ls_stack[fr], 0));
                        fr++;
                }
                /* Remaining frames that did not fit beside the histogram. */
                while (rectype > LS_STACK(fr) && lsp->ls_stack[fr] != 0) {
                        (void) fprintf(out, "%15s %-36s %-24s\n", "", "",
                            format_symbol(buf, lsp->ls_stack[fr], 0));
                        fr++;
                }
        }

        if (!g_pflag)
                (void) fprintf(out, "---------------------------------"
                    "----------------------------------------------\n");

        (void) fflush(out);
}

/*
 * Print the tracing-mode (-T) listing: one line per recorded event in
 * sort_buf order, followed by its stack when the record type includes one.
 * Records whose event is LS_MAX_EVENTS or above, or whose count is zero,
 * are skipped.  The column header is omitted with -p.
 */
static void
report_trace(FILE *out, lsrec_t **sort_buf)
{
        char lockbuf[256], callerbuf[256];
        const int rectype = g_recsize;
        int idx;

        if (!g_pflag) {
                (void) fprintf(out, "%5s  %7s  %11s  %-24s  %-24s\n",
                    "Event", "Time", "Owner", "Lock", "Caller");
                (void) fprintf(out, "---------------------------------"
                    "----------------------------------------------\n");
        }

        for (idx = 0; idx < g_nrecs_used; idx++) {
                lsrec_t *rec = sort_buf[idx];
                int fr;

                if (rec->ls_event >= LS_MAX_EVENTS || rec->ls_count == 0)
                        continue;

                (void) fprintf(out, "%2d  %10llu  %11p  %-24s  %-24s\n",
                    rec->ls_event, (unsigned long long)rec->ls_time,
                    (void *)rec->ls_next,
                    format_symbol(lockbuf, rec->ls_lock, 0),
                    format_symbol(callerbuf, rec->ls_caller, 0));

                if (rectype > LS_STACK(0)) {
                        /*
                         * If one of frames 3..1 is the caller, start the
                         * listing there; otherwise start at frame 0.
                         */
                        fr = 3;
                        while (fr > 0 && rec->ls_stack[fr] != rec->ls_caller)
                                fr--;

                        for (; rectype > LS_STACK(fr) &&
                            rec->ls_stack[fr] != 0; fr++) {
                                (void) fprintf(out, "%53s  %-24s\n", "",
                                    format_symbol(lockbuf,
                                    rec->ls_stack[fr], 0));
                        }
                        (void) fprintf(out, "\n");
                }
        }

        (void) fflush(out);
}