root/sys/cddl/dev/profile/profile.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Portions Copyright 2006-2008 John Birrell jb@freebsd.org
 *
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/cpuvar.h>
#include <sys/endian.h>
#include <sys/fcntl.h>
#include <sys/filio.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/kthread.h>
#include <sys/limits.h>
#include <sys/linker.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/module.h>
#include <sys/mutex.h>
#include <sys/poll.h>
#include <sys/proc.h>
#include <sys/selinfo.h>
#include <sys/smp.h>
#include <sys/stdarg.h>
#include <sys/sysctl.h>
#include <sys/uio.h>
#include <sys/unistd.h>
#include <machine/cpu.h>

#include <sys/dtrace.h>
#include <sys/dtrace_bsd.h>

#include <cddl/dev/dtrace/dtrace_cddl.h>

#define PROF_NAMELEN            15

#define PROF_PROFILE            0
#define PROF_TICK               1
#define PROF_PREFIX_PROFILE     "profile-"
#define PROF_PREFIX_TICK        "tick-"

/*
 * Regardless of platform, there are five artificial frames in the case of the
 * profile provider:
 *
 *      profile_fire
 *      cyclic_expire
 *      cyclic_fire
 *      [ cbe ]
 *      [ locore ]
 *
 * On amd64, there are two frames associated with locore:  one in locore, and
 * another in common interrupt dispatch code.  (i386 has not been modified to
 * use this common layer.)  Further, on i386, the interrupted instruction
 * appears as its own stack frame.  All of this means that we need to add one
 * frame for amd64, and then take one away for both amd64 and i386.
 *
 * All of the above constraints lead to the mess below.  Yes, the profile
 * provider should ideally figure this out on-the-fly by hiting one of its own
 * probes and then walking its own stack trace.  This is complicated, however,
 * and the static definition doesn't seem to be overly brittle.  Still, we
 * allow for a manual override in case we get it completely wrong.
 */
#ifdef __amd64
#define PROF_ARTIFICIAL_FRAMES  10
#else
#ifdef __i386
#define PROF_ARTIFICIAL_FRAMES  6
#endif
#endif

#ifdef __powerpc__
/*
 * This value is bogus just to make module compilable on powerpc
 */
#define PROF_ARTIFICIAL_FRAMES  8
#endif

struct profile_probe_percpu;

#ifdef __arm__
#define PROF_ARTIFICIAL_FRAMES  3
#endif

#ifdef __aarch64__
#define PROF_ARTIFICIAL_FRAMES  12
#endif

#ifdef __riscv
#define PROF_ARTIFICIAL_FRAMES  12
#endif

typedef struct profile_probe {
        dtrace_id_t     prof_id;
        int             prof_kind;
#ifdef illumos
        hrtime_t        prof_interval;
        cyclic_id_t     prof_cyclic;
#else
        sbintime_t      prof_interval;
        struct callout  prof_cyclic;
        sbintime_t      prof_expected;
        struct profile_probe_percpu **prof_pcpus;
#endif
} profile_probe_t;

typedef struct profile_probe_percpu {
        hrtime_t        profc_expected;
        hrtime_t        profc_interval;
        profile_probe_t *profc_probe;
#ifdef __FreeBSD__
        struct callout  profc_cyclic;
#endif
} profile_probe_percpu_t;

static int      profile_unload(void);
static void     profile_create(hrtime_t, char *, int);
static void     profile_destroy(void *, dtrace_id_t, void *);
static void     profile_enable(void *, dtrace_id_t, void *);
static void     profile_disable(void *, dtrace_id_t, void *);
static void     profile_load(void *);
static void     profile_provide(void *, dtrace_probedesc_t *);

static int profile_rates[] = {
    97, 199, 499, 997, 1999,
    4001, 4999, 0, 0, 0,
    0, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};

static int profile_ticks[] = {
    1, 10, 100, 500, 1000,
    5000, 0, 0, 0, 0,
    0, 0, 0, 0, 0
};

/*
 * profile_max defines the upper bound on the number of profile probes that
 * can exist (this is to prevent malicious or clumsy users from exhausing
 * system resources by creating a slew of profile probes). At mod load time,
 * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
 * present in the profile.conf file.
 */
#define PROFILE_MAX_DEFAULT     1000    /* default max. number of probes */
static uint32_t profile_max = PROFILE_MAX_DEFAULT;
                                        /* maximum number of profile probes */
static uint32_t profile_total;          /* current number of profile probes */

static dtrace_pattr_t profile_attr = {
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA },
};

static dtrace_pops_t profile_pops = {
        .dtps_provide =         profile_provide,
        .dtps_provide_module =  NULL,
        .dtps_enable =          profile_enable,
        .dtps_disable =         profile_disable,
        .dtps_suspend =         NULL,
        .dtps_resume =          NULL,
        .dtps_getargdesc =      NULL,
        .dtps_getargval =       NULL,
        .dtps_usermode =        NULL,
        .dtps_destroy =         profile_destroy
};

static dtrace_provider_id_t     profile_id;
static hrtime_t                 profile_interval_min = NANOSEC / 5000;  /* 5000 hz */
static int                      profile_aframes = PROF_ARTIFICIAL_FRAMES;

SYSCTL_DECL(_kern_dtrace);
SYSCTL_NODE(_kern_dtrace, OID_AUTO, profile, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "DTrace profile parameters");
SYSCTL_INT(_kern_dtrace_profile, OID_AUTO, aframes, CTLFLAG_RW, &profile_aframes,
    0, "Skipped frames for profile provider");

static sbintime_t
nsec_to_sbt(hrtime_t nsec)
{
        time_t sec;

        /*
         * We need to calculate nsec * 2^32 / 10^9
         * Seconds and nanoseconds are split to avoid overflow.
         */
        sec = nsec / NANOSEC;
        nsec = nsec % NANOSEC;
        return (((sbintime_t)sec << 32) | ((sbintime_t)nsec << 32) / NANOSEC);
}

static hrtime_t
sbt_to_nsec(sbintime_t sbt)
{

        return ((sbt >> 32) * NANOSEC +
            (((uint32_t)sbt * (hrtime_t)NANOSEC) >> 32));
}

static void
profile_probe(profile_probe_t *prof, hrtime_t late)
{
        struct thread *td;
        struct trapframe *frame;
        uintfptr_t pc, upc;

        td = curthread;
        pc = upc = 0;

        /*
         * td_intr_frame can be unset if this is a catch-up event upon waking up
         * from idle sleep. This can only happen on a CPU idle thread. Use a
         * representative arg0 value in this case so that one of the probe
         * arguments is non-zero.
         */
        frame = td->td_intr_frame;
        if (frame != NULL) {
                if (TRAPF_USERMODE(frame))
                        upc = TRAPF_PC(frame);
                else {
                        pc = TRAPF_PC(frame);
                        td->t_dtrace_trapframe = frame;
                }
        } else if (TD_IS_IDLETHREAD(td))
                pc = (uintfptr_t)&cpu_idle;

        dtrace_probe(prof->prof_id, pc, upc, late, 0, 0);
        td->t_dtrace_trapframe = NULL;
}

static void
profile_fire(void *arg)
{
        profile_probe_percpu_t *pcpu = arg;
        profile_probe_t *prof = pcpu->profc_probe;
        hrtime_t late;

        late = sbt_to_nsec(sbinuptime() - pcpu->profc_expected);

        profile_probe(prof, late);
        pcpu->profc_expected += pcpu->profc_interval;
        callout_schedule_sbt_curcpu(&pcpu->profc_cyclic,
            pcpu->profc_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
}

static void
profile_tick(void *arg)
{
        profile_probe_t *prof = arg;

        profile_probe(prof, 0);
        prof->prof_expected += prof->prof_interval;
        callout_schedule_sbt(&prof->prof_cyclic,
            prof->prof_expected, 0, C_DIRECT_EXEC | C_ABSOLUTE);
}

static void
profile_create(hrtime_t interval, char *name, int kind)
{
        profile_probe_t *prof;

        if (interval < profile_interval_min)
                return;

        if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
                return;

        atomic_add_32(&profile_total, 1);
        if (profile_total > profile_max) {
                atomic_add_32(&profile_total, -1);
                return;
        }

        prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
#ifdef illumos
        prof->prof_interval = interval;
        prof->prof_cyclic = CYCLIC_NONE;
#else
        prof->prof_interval = nsec_to_sbt(interval);
        callout_init(&prof->prof_cyclic, 1);
#endif
        prof->prof_kind = kind;
        prof->prof_id = dtrace_probe_create(profile_id,
            NULL, NULL, name,
            profile_aframes, prof);
}

/*ARGSUSED*/
static void
profile_provide(void *arg, dtrace_probedesc_t *desc)
{
        int i, j, rate, kind;
        hrtime_t val = 0, mult = 1, len = 0;
        char *name, *suffix = NULL;

        const struct {
                char *prefix;
                int kind;
        } types[] = {
                { PROF_PREFIX_PROFILE, PROF_PROFILE },
                { PROF_PREFIX_TICK, PROF_TICK },
                { 0, 0 }
        };

        const struct {
                char *name;
                hrtime_t mult;
        } suffixes[] = {
                { "ns",         NANOSEC / NANOSEC },
                { "nsec",       NANOSEC / NANOSEC },
                { "us",         NANOSEC / MICROSEC },
                { "usec",       NANOSEC / MICROSEC },
                { "ms",         NANOSEC / MILLISEC },
                { "msec",       NANOSEC / MILLISEC },
                { "s",          NANOSEC / SEC },
                { "sec",        NANOSEC / SEC },
                { "m",          NANOSEC * (hrtime_t)60 },
                { "min",        NANOSEC * (hrtime_t)60 },
                { "h",          NANOSEC * (hrtime_t)(60 * 60) },
                { "hour",       NANOSEC * (hrtime_t)(60 * 60) },
                { "d",          NANOSEC * (hrtime_t)(24 * 60 * 60) },
                { "day",        NANOSEC * (hrtime_t)(24 * 60 * 60) },
                { "hz",         0 },
                { NULL }
        };

        if (desc == NULL) {
                char n[PROF_NAMELEN];

                /*
                 * If no description was provided, provide all of our probes.
                 */
                for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
                        if ((rate = profile_rates[i]) == 0)
                                continue;

                        (void) snprintf(n, PROF_NAMELEN, "%s%d",
                            PROF_PREFIX_PROFILE, rate);
                        profile_create(NANOSEC / rate, n, PROF_PROFILE);
                }

                for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
                        if ((rate = profile_ticks[i]) == 0)
                                continue;

                        (void) snprintf(n, PROF_NAMELEN, "%s%d",
                            PROF_PREFIX_TICK, rate);
                        profile_create(NANOSEC / rate, n, PROF_TICK);
                }

                return;
        }

        name = desc->dtpd_name;

        for (i = 0; types[i].prefix != NULL; i++) {
                len = strlen(types[i].prefix);

                if (strncmp(name, types[i].prefix, len) != 0)
                        continue;
                break;
        }

        if (types[i].prefix == NULL)
                return;

        kind = types[i].kind;
        j = strlen(name) - len;

        /*
         * We need to start before any time suffix.
         */
        for (j = strlen(name); j >= len; j--) {
                if (name[j] >= '0' && name[j] <= '9')
                        break;
                suffix = &name[j];
        }

        ASSERT(suffix != NULL);

        /*
         * Now determine the numerical value present in the probe name.
         */
        for (; j >= len; j--) {
                if (name[j] < '0' || name[j] > '9')
                        return;

                val += (name[j] - '0') * mult;
                mult *= (hrtime_t)10;
        }

        if (val == 0)
                return;

        /*
         * Look-up the suffix to determine the multiplier.
         */
        for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
                if (strcasecmp(suffixes[i].name, suffix) == 0) {
                        mult = suffixes[i].mult;
                        break;
                }
        }

        if (suffixes[i].name == NULL && *suffix != '\0')
                return;

        if (mult == 0) {
                /*
                 * The default is frequency-per-second.
                 */
                val = NANOSEC / val;
        } else {
                val *= mult;
        }

        profile_create(val, name, kind);
}

/* ARGSUSED */
static void
profile_destroy(void *arg, dtrace_id_t id, void *parg)
{
        profile_probe_t *prof = parg;

#ifdef illumos
        ASSERT(prof->prof_cyclic == CYCLIC_NONE);
#else
        ASSERT(!callout_active(&prof->prof_cyclic) && prof->prof_pcpus == NULL);
#endif
        kmem_free(prof, sizeof (profile_probe_t));

        ASSERT(profile_total >= 1);
        atomic_add_32(&profile_total, -1);
}

#ifdef illumos
/*ARGSUSED*/
static void
profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
{
        profile_probe_t *prof = arg;
        profile_probe_percpu_t *pcpu;

        pcpu = kmem_zalloc(sizeof (profile_probe_percpu_t), KM_SLEEP);
        pcpu->profc_probe = prof;

        hdlr->cyh_func = profile_fire;
        hdlr->cyh_arg = pcpu;

        when->cyt_interval = prof->prof_interval;
        when->cyt_when = gethrtime() + when->cyt_interval;

        pcpu->profc_expected = when->cyt_when;
        pcpu->profc_interval = when->cyt_interval;
}

/*ARGSUSED*/
static void
profile_offline(void *arg, cpu_t *cpu, void *oarg)
{
        profile_probe_percpu_t *pcpu = oarg;

        ASSERT(pcpu->profc_probe == arg);
        kmem_free(pcpu, sizeof (profile_probe_percpu_t));
}

/* ARGSUSED */
static void
profile_enable(void *arg, dtrace_id_t id, void *parg)
{
        profile_probe_t *prof = parg;
        cyc_omni_handler_t omni;
        cyc_handler_t hdlr;
        cyc_time_t when;

        ASSERT(prof->prof_interval != 0);
        ASSERT(MUTEX_HELD(&cpu_lock));

        if (prof->prof_kind == PROF_TICK) {
                hdlr.cyh_func = profile_tick;
                hdlr.cyh_arg = prof;

                when.cyt_interval = prof->prof_interval;
                when.cyt_when = gethrtime() + when.cyt_interval;
        } else {
                ASSERT(prof->prof_kind == PROF_PROFILE);
                omni.cyo_online = profile_online;
                omni.cyo_offline = profile_offline;
                omni.cyo_arg = prof;
        }

        if (prof->prof_kind == PROF_TICK) {
                prof->prof_cyclic = cyclic_add(&hdlr, &when);
        } else {
                prof->prof_cyclic = cyclic_add_omni(&omni);
        }
}

/* ARGSUSED */
static void
profile_disable(void *arg, dtrace_id_t id, void *parg)
{
        profile_probe_t *prof = parg;

        ASSERT(prof->prof_cyclic != CYCLIC_NONE);
        ASSERT(MUTEX_HELD(&cpu_lock));

        cyclic_remove(prof->prof_cyclic);
        prof->prof_cyclic = CYCLIC_NONE;
}

#else

static void
profile_enable_omni(profile_probe_t *prof)
{
        profile_probe_percpu_t *pcpu;
        int cpu;

        prof->prof_pcpus = kmem_zalloc((mp_maxid + 1) * sizeof(pcpu), KM_SLEEP);
        CPU_FOREACH(cpu) {
                pcpu = kmem_zalloc(sizeof(profile_probe_percpu_t), KM_SLEEP);
                prof->prof_pcpus[cpu] = pcpu;
                pcpu->profc_probe = prof;
                pcpu->profc_expected = sbinuptime() + prof->prof_interval;
                pcpu->profc_interval = prof->prof_interval;
                callout_init(&pcpu->profc_cyclic, 1);
                callout_reset_sbt_on(&pcpu->profc_cyclic,
                    pcpu->profc_expected, 0, profile_fire, pcpu,
                    cpu, C_DIRECT_EXEC | C_ABSOLUTE);
        }
}

static void
profile_disable_omni(profile_probe_t *prof)
{
        profile_probe_percpu_t *pcpu;
        int cpu;

        ASSERT(prof->prof_pcpus != NULL);
        CPU_FOREACH(cpu) {
                pcpu = prof->prof_pcpus[cpu];
                ASSERT(pcpu->profc_probe == prof);
                ASSERT(callout_active(&pcpu->profc_cyclic));
                callout_stop(&pcpu->profc_cyclic);
                callout_drain(&pcpu->profc_cyclic);
                kmem_free(pcpu, sizeof(profile_probe_percpu_t));
        }
        kmem_free(prof->prof_pcpus, (mp_maxid + 1) * sizeof(pcpu));
        prof->prof_pcpus = NULL;
}

/* ARGSUSED */
static void
profile_enable(void *arg, dtrace_id_t id, void *parg)
{
        profile_probe_t *prof = parg;

        if (prof->prof_kind == PROF_TICK) {
                prof->prof_expected = sbinuptime() + prof->prof_interval;
                callout_reset_sbt(&prof->prof_cyclic,
                    prof->prof_expected, 0, profile_tick, prof,
                    C_DIRECT_EXEC | C_ABSOLUTE);
        } else {
                ASSERT(prof->prof_kind == PROF_PROFILE);
                profile_enable_omni(prof);
        }
}

/* ARGSUSED */
static void
profile_disable(void *arg, dtrace_id_t id, void *parg)
{
        profile_probe_t *prof = parg;

        if (prof->prof_kind == PROF_TICK) {
                ASSERT(callout_active(&prof->prof_cyclic));
                callout_stop(&prof->prof_cyclic);
                callout_drain(&prof->prof_cyclic);
        } else {
                ASSERT(prof->prof_kind == PROF_PROFILE);
                profile_disable_omni(prof);
        }
}
#endif

static void
profile_load(void *dummy)
{
        if (dtrace_register("profile", &profile_attr, DTRACE_PRIV_USER,
            NULL, &profile_pops, NULL, &profile_id) != 0)
                return;
}


static int
profile_unload(void)
{
        int error = 0;

        if ((error = dtrace_unregister(profile_id)) != 0)
                return (error);

        return (error);
}

/* ARGSUSED */
static int
profile_modevent(module_t mod __unused, int type, void *data __unused)
{
        int error = 0;

        switch (type) {
        case MOD_LOAD:
                break;

        case MOD_UNLOAD:
                break;

        case MOD_SHUTDOWN:
                break;

        default:
                error = EOPNOTSUPP;
                break;

        }
        return (error);
}

SYSINIT(profile_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_load, NULL);
SYSUNINIT(profile_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, profile_unload, NULL);

DEV_MODULE(profile, profile_modevent, NULL);
MODULE_VERSION(profile, 1);
MODULE_DEPEND(profile, dtrace, 1, 1, 1);
MODULE_DEPEND(profile, opensolaris, 1, 1, 1);