root/usr/src/uts/common/os/cap_util.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * Support for determining capacity and utilization of performance relevant
 * hardware components in a computer
 *
 * THEORY
 * ------
 * The capacity and utilization of the performance relevant hardware components
 * is needed to be able to optimize performance while minimizing the amount of
 * power used on a system.  The idea is to use hardware performance counters
 * and potentially other means to determine the capacity and utilization of
 * performance relevant hardware components (eg. execution pipeline, cache,
 * memory, etc.) and attribute the utilization to the responsible CPU and the
 * thread running there.
 *
 * This will help characterize the utilization of performance relevant
 * components and how much is used by each CPU and each thread.  With
 * that data, the utilization can be aggregated to all the CPUs sharing each
 * performance relevant hardware component to calculate the total utilization
 * of each component and compare that with the component's capacity to
 * essentially determine the actual hardware load of the component.  The
 * hardware utilization attributed to each running thread can also be
 * aggregated to determine the total hardware utilization of each component to
 * a workload.
 *
 * Once that is done, one can determine how much of each performance relevant
 * hardware component is needed by a given thread or set of threads (eg. a
 * workload) and size up exactly what hardware is needed by the threads and how
 * much.  With this info, we can better place threads among CPUs to match their
 * exact hardware resource needs and potentially lower or raise the power based
 * on their utilization or pack threads onto the fewest hardware components
 * needed and power off any remaining unused components to minimize power
 * without sacrificing performance.
 *
 * IMPLEMENTATION
 * --------------
 * The code has been designed and implemented to make (un)programming and
 * reading the counters for a given CPU as lightweight and fast as possible.
 * This is very important because we need to read and potentially (un)program
 * the counters very often and in performance sensitive code.  Specifically,
 * the counters may need to be (un)programmed during context switch and/or a
 * cyclic handler when there are more counter events to count than existing
 * counters.
 *
 * Consequently, the code has been split up to allow allocating and
 * initializing everything needed to program and read the counters on a given
 * CPU once and make (un)programming and reading the counters for a given CPU
 * not have to allocate/free memory or grab any locks.  To do this, all the
 * state needed to (un)program and read the counters on a CPU is kept per CPU
 * and is made lock free by forcing any code that reads or manipulates the
 * counters or the state needed to (un)program or read the counters to run on
 * the target CPU and disable preemption while running on the target CPU to
 * protect any critical sections. All counter manipulation on the target CPU is
 * happening either from a cross-call to the target CPU or at the same PIL as
 * used by the cross-call subsystem. This guarantees that counter manipulation
 * is not interrupted by cross-calls from other CPUs.
 *
 * The synchronization has been made lock free or as simple as possible for
 * performance and to avoid getting the locking all tangled up when we interpose
 * on the CPC routines that (un)program the counters to manage the counters
 * between the kernel and user on each CPU.  When the user starts using the
 * counters on a given CPU, the kernel will unprogram the counters that it is
 * using on that CPU just before they are programmed for the user.  Then the
 * kernel will program the counters on a given CPU for its own use when the user
 * stops using them.
 *
 * There is a special interaction with the DTrace cpc provider (dcpc). Before
 * dcpc enables any probe, it requests that all counters used for capacity and
 * utilization be disabled and unprogrammed. These counters are not
 * re-programmed until dcpc is done with them. When all DTrace cpc probes are
 * removed, dcpc notifies the CU framework, which re-programs the counters.
 *
 * When a CPU is going offline, its CU counters are unprogrammed and disabled,
 * so that they would not be re-programmed again by some other activity on the
 * CPU that is going offline.
 *
 * The counters are programmed during boot.  However, a flag is available to
 * disable this if necessary (see cu_flags below).  A handler is provided to
 * (un)program the counters during CPU on/offline.  Basic routines are provided
 * to initialize and tear down this module, initialize and tear down any state
 * needed for a given CPU, and (un)program the counters for a given CPU.
 * Lastly, a handler is provided to read the counters and attribute the
 * utilization to the responsible CPU.
 */
#include <sys/types.h>
#include <sys/cmn_err.h>
#include <sys/cpuvar.h>
#include <sys/ddi.h>
#include <sys/systm.h>
#include <sys/disp.h>
#include <sys/sdt.h>
#include <sys/sunddi.h>
#include <sys/thread.h>
#include <sys/pghw.h>
#include <sys/cmt.h>
#include <sys/policy.h>
#include <sys/x_call.h>
#include <sys/cap_util.h>

#include <sys/archsystm.h>
#include <sys/promif.h>

#if defined(__x86)
#include <sys/xc_levels.h>
#endif


/*
 * Default CPU hardware performance counter flags to use for measuring capacity
 * and utilization.  This is the initial value of cu_cpc_flags.  When CPC
 * requests are built, CPC_OVF_NOTIFY_EMT is dropped from the flags if the PCBE
 * does not report CPC_CAP_OVERFLOW_INTERRUPT.
 */
#define CU_CPC_FLAGS_DEFAULT    \
        (CPC_COUNT_USER|CPC_COUNT_SYSTEM|CPC_OVF_NOTIFY_EMT)

/*
 * Possible Flags for controlling this module.  These are individual bits that
 * are kept (OR'ed together) in cu_flags.
 */
#define CU_FLAG_ENABLE          1       /* Enable module */
#define CU_FLAG_READY           2       /* Ready to setup module */
#define CU_FLAG_ON              4       /* Module is on */

/*
 * pg_cpu kstats calculate utilization rate and maximum utilization rate for
 * some CPUs. The rate is calculated based on data from two subsequent
 * snapshots. When the time between such two snapshots is too small, the
 * resulting rate may have low accuracy, so we only consider snapshots which
 * are separated by at least CU_SAMPLE_INTERVAL_MIN nanoseconds from one
 * another. We do not update the rate if the interval is smaller than that.
 *
 * Use one tenth of a second as the minimum interval for utilization rate
 * calculation.
 *
 * NOTE: The CU_SAMPLE_INTERVAL_MIN should be higher than the scaling factor in
 * the CU_RATE() macro below to guarantee that we never divide by zero.
 *
 * Rate is the number of events per second. The rate is the number of events
 * divided by time and multiplied by the number of nanoseconds in a second. We
 * do not want time to be too small since it will cause large errors in
 * division.
 *
 * We do not want to multiply two large numbers (the instruction count and
 * NANOSEC) either since it may cause integer overflow. So we divide both the
 * numerator and the denominator by the same value.
 *
 * NOTE: The scaling factor below should be less than CU_SAMPLE_INTERVAL_MIN
 * above to guarantee that time divided by this value is always non-zero.
 */

/*
 * CU_RATE(val, time): events per second for 'val' events observed over 'time'
 * nanoseconds.  Both NANOSEC and 'time' are divided by CU_SCALE first, so
 * 'time' must be at least CU_SCALE or the divisor becomes zero.  Each argument
 * is evaluated exactly once.
 */
#define CU_RATE(val, time) \
        (((val) * (NANOSEC / CU_SCALE)) / ((time) / CU_SCALE))

/* Minimum interval between snapshots used for rates: a tenth of a second */
#define CU_SAMPLE_INTERVAL_MIN  (NANOSEC / 10)

/* Scaling factor used by CU_RATE(); works out to NANOSEC / 100000 */
#define CU_SCALE (CU_SAMPLE_INTERVAL_MIN / 10000)

/*
 * When the time between two kstat reads for the same CPU is less than
 * CU_UPDATE_THRESHOLD use the old counter data and skip updating counter values
 * for the CPU. This helps reduce cross-calls when kstat consumers read data
 * very often or when they read PG utilization data and then CPU utilization
 * data quickly after that.
 *
 * The value is in nanoseconds (one tenth of a second) and is the initial value
 * of the cu_update_threshold variable.
 */
#define CU_UPDATE_THRESHOLD (NANOSEC / 10)

/*
 * The IS_HIPIL() macro verifies that the code is executed either from a
 * cross-call or from high-PIL interrupt
 *
 * When DEBUG is defined it compares the current PIL (getpil()) against
 * XCALL_PIL.  Otherwise it expands to nothing, so it is not usable as a
 * standalone expression; it is meant to be used only as the argument of
 * ASSERT().
 */
#ifdef DEBUG
#define IS_HIPIL() (getpil() >= XCALL_PIL)
#else
#define IS_HIPIL()
#endif  /* DEBUG */


/*
 * Type of routine that cu_cpu_run() is asked to run for a given CPU.  The
 * first argument is an opaque, caller supplied value and the second one is
 * where the routine stores its status (see cu_cpc_program_xcall() for an
 * example).
 */
typedef void (*cu_cpu_func_t)(uintptr_t, int *);


/*
 * Flags to use for programming CPU hardware performance counters to measure
 * capacity and utilization.  Read by cu_cpc_req_add() each time a CPC request
 * is added.
 */
int                             cu_cpc_flags = CU_CPC_FLAGS_DEFAULT;

/*
 * Initial value used for programming hardware counters.  Passed as the preset
 * for every CPC request added by cu_cpc_req_add().
 */
uint64_t                        cu_cpc_preset_value = 0;

/*
 * List of CPC event requests for capacity and utilization.  Allocated once by
 * cu_init() and reset after each CPU so its space is reused for the next CPU.
 */
static kcpc_request_list_t      *cu_cpc_reqs = NULL;

/*
 * When a CPU is a member of PG with a sharing relationship that is supported
 * by the capacity/utilization framework, a kstat is created for that CPU and
 * sharing relationship.
 *
 * These kstats are updated one at a time, so we can have a single scratch
 * space to fill the data.
 *
 * CPU counter kstats fields (exported kstat name and data type in brackets):
 *
 *   cu_cpu_id          CPU ID for this kstat [cpu_id, uint32]
 *
 *   cu_pg_id           PG ID for the relationship described by this kstat
 *                        [pg_id, int32]
 *
 *   cu_generation      Generation value that increases whenever any CPU goes
 *                        offline or online. Two kstat snapshots for the same
 *                        CPU may only be compared if they have the same
 *                        generation. [generation, uint32]
 *
 *   cu_cpu_util        Running value of CPU utilization for the sharing
 *                        relationship [hw_util, uint64]
 *
 *   cu_cpu_time_running Total time spent collecting CU data. The time may be
 *                         less than wall time if CU counters were stopped for
 *                         some time. [hw_util_time_running, uint64]
 *
 *   cu_cpu_time_stopped Total time the CU counters were stopped.
 *                         [hw_util_time_stopped, uint64]
 *
 *   cu_cpu_rate        Utilization rate, expressed in operations per second.
 *                        [hw_util_rate, uint64]
 *
 *   cu_cpu_rate_max    Maximum observed value of utilization rate.
 *                        [hw_util_rate_max, uint64]
 *
 *   cu_cpu_relationship Name of sharing relationship for the PG in this kstat
 *                         [relationship, string]
 *
 * The initializers below must stay in the same order as the structure members.
 */
struct cu_cpu_kstat {
        kstat_named_t   cu_cpu_id;
        kstat_named_t   cu_pg_id;
        kstat_named_t   cu_generation;
        kstat_named_t   cu_cpu_util;
        kstat_named_t   cu_cpu_time_running;
        kstat_named_t   cu_cpu_time_stopped;
        kstat_named_t   cu_cpu_rate;
        kstat_named_t   cu_cpu_rate_max;
        kstat_named_t   cu_cpu_relationship;
} cu_cpu_kstat = {
        { "cpu_id",                     KSTAT_DATA_UINT32 },
        { "pg_id",                      KSTAT_DATA_INT32 },
        { "generation",                 KSTAT_DATA_UINT32 },
        { "hw_util",                    KSTAT_DATA_UINT64 },
        { "hw_util_time_running",       KSTAT_DATA_UINT64 },
        { "hw_util_time_stopped",       KSTAT_DATA_UINT64 },
        { "hw_util_rate",               KSTAT_DATA_UINT64 },
        { "hw_util_rate_max",           KSTAT_DATA_UINT64 },
        { "relationship",               KSTAT_DATA_STRING },
};

/*
 * Flags for controlling this module (CU_FLAG_* bits).  Only CU_FLAG_ENABLE is
 * set initially; cu_init() adds CU_FLAG_READY and then CU_FLAG_ON.
 */
uint_t                          cu_flags = CU_FLAG_ENABLE;

/*
 * Error return value for cu_init() since it can't return anything to be called
 * from mp_init_tbl[] (:-(
 *
 * Values stored by cu_init():
 *    0  no error
 *   -1  module not enabled or already on
 *   -2  kcpc_init() failed
 *   -3  no CPU hardware performance counters
 *   -5  cu_cpu_init() failed for at least one CPU
 *   -6  programming the counters failed for at least one CPU
 */
static int                      cu_init_error = 0;

/*
 * Run-time copy of CU_SAMPLE_INTERVAL_MIN (minimum interval, in nanoseconds,
 * between snapshots used for rate calculation)
 */
hrtime_t                        cu_sample_interval_min = CU_SAMPLE_INTERVAL_MIN;

/*
 * Run-time copy of CU_UPDATE_THRESHOLD (in nanoseconds)
 */
hrtime_t                        cu_update_threshold = CU_UPDATE_THRESHOLD;

/*
 * NOTE(review): presumably serializes use of the single cu_cpu_kstat scratch
 * space by the pg_cpu kstat update routine; its users are not visible in this
 * part of the file, so confirm before relying on this.
 */
static kmutex_t                 pg_cpu_kstat_lock;


/*
 * Forward declaration of interface routines (external linkage).
 *
 * cu_cpc_program() and cu_cpc_unprogram() report their status through *err
 * rather than through a return value.
 */
void            cu_disable(void);
void            cu_enable(void);
void            cu_init(void);
void            cu_cpc_program(cpu_t *cp, int *err);
void            cu_cpc_unprogram(cpu_t *cp, int *err);
int             cu_cpu_update(struct cpu *cp, boolean_t move_to);
void            cu_pg_update(pghw_t *pg);


/*
 * Forward declaration of private routines (file scope only)
 */
static int      cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs);
static void     cu_cpc_program_xcall(uintptr_t arg, int *err);
static int      cu_cpc_req_add(char *event, kcpc_request_list_t *reqs,
    int nreqs, cu_cntr_stats_t *stats, int kmem_flags, int *nevents);
static int      cu_cpu_callback(cpu_setup_t what, int id, void *arg);
static void     cu_cpu_disable(cpu_t *cp);
static void     cu_cpu_enable(cpu_t *cp);
static int      cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs);
static int      cu_cpu_fini(cpu_t *cp);
static void     cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info);
static int      cu_cpu_kstat_update(kstat_t *ksp, int rw);
static int      cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg);
static int      cu_cpu_update_stats(cu_cntr_stats_t *stats,
    uint64_t cntr_value);
static void cu_cpu_info_detach_xcall(void);

/*
 * Disable Capacity Utilization counters on all CPUs.
 *
 * Walks the list of online CPUs starting at cpu_active and calls
 * cu_cpu_disable() for every CPU that does not have CPU_OFFLINE set.
 * The caller must hold cpu_lock.
 */
void
cu_disable(void)
{
        cpu_t *cpu;

        ASSERT(MUTEX_HELD(&cpu_lock));

        cpu = cpu_active;
        do {
                if ((cpu->cpu_flags & CPU_OFFLINE) == 0)
                        cu_cpu_disable(cpu);

                cpu = cpu->cpu_next_onln;
        } while (cpu != cpu_active);
}


/*
 * Enable Capacity Utilization counters on all CPUs.
 *
 * Walks the list of online CPUs starting at cpu_active and calls
 * cu_cpu_enable() for every CPU that does not have CPU_OFFLINE set.
 * The caller must hold cpu_lock.
 */
void
cu_enable(void)
{
        cpu_t *cpu;

        ASSERT(MUTEX_HELD(&cpu_lock));

        cpu = cpu_active;
        do {
                if ((cpu->cpu_flags & CPU_OFFLINE) == 0)
                        cu_cpu_enable(cpu);

                cpu = cpu->cpu_next_onln;
        } while (cpu != cpu_active);
}


/*
 * Set up capacity and utilization support.
 *
 * Nothing is returned; the outcome is left in cu_init_error:
 *
 *    0  no error
 *   -1  module is not enabled or is on already
 *   -2  kcpc_init() failed
 *   -3  there are no CPU hardware performance counters
 *   -5  per CPU state could not be set up for at least one CPU
 *   -6  counters could not be programmed on at least one CPU
 *
 * The -5 and -6 cases do not stop the setup: the remaining CPUs are still
 * processed, the module is still marked on, and the CPU setup callback is
 * still registered.
 */
void
cu_init(void)
{
        cpu_t   *cpu;

        cu_init_error = 0;

        /*
         * Only go ahead when the module is enabled and has not been turned
         * on yet
         */
        if ((cu_flags & CU_FLAG_ENABLE) == 0 ||
            (cu_flags & CU_FLAG_ON) != 0) {
                cu_init_error = -1;
                return;
        }

        if (kcpc_init() != 0) {
                cu_init_error = -2;
                return;
        }

        /*
         * Hardware capacity and utilization cannot be measured without any
         * CPU hardware performance counters
         */
        if (cpc_ncounters <= 0) {
                cu_init_error = -3;
                return;
        }

        /*
         * One CPC event request list, big enough for all counters, is shared
         * by all CPUs
         */
        cu_cpc_reqs = kcpc_reqs_init(cpc_ncounters, KM_SLEEP);

        mutex_enter(&cpu_lock);

        /*
         * The module is ready to be set up from here on
         */
        cu_flags |= CU_FLAG_READY;

        /*
         * First pass over the active CPUs: allocate and set up the state
         * needed to measure capacity and utilization.  The request list is
         * reset after each CPU so that its space can be reused for the next
         * one.
         */
        cpu = cpu_active;
        do {
                if (cu_cpu_init(cpu, cu_cpc_reqs) != 0)
                        cu_init_error = -5;

                (void) kcpc_reqs_reset(cu_cpc_reqs);
        } while ((cpu = cpu->cpu_next_onln) != cpu_active);

        /*
         * The module is on now and counters are ready to be programmed on all
         * active CPUs
         */
        cu_flags |= CU_FLAG_ON;

        /*
         * Second pass over the active CPUs: program the counters
         */
        cpu = cpu_active;
        do {
                if (cu_cpu_run(cpu, cu_cpc_program_xcall,
                    (uintptr_t)B_FALSE) != 0)
                        cu_init_error = -6;
        } while ((cpu = cpu->cpu_next_onln) != cpu_active);

        /*
         * From now on get called back for CPU state changes so that CPC
         * counters can be enabled and disabled as CPUs come on and offline
         */
        register_cpu_setup_func(cu_cpu_callback, NULL);

        mutex_exit(&cpu_lock);
}


/*
 * Return number of counter events needed to measure capacity and utilization
 * for specified CPU.  When a list of CPC requests is given, also add a request
 * for each of these events to the list and set up the per PG counter info and
 * PG CPU kstats.  When reqs is NULL, the events are only counted.
 *
 * Return values:
 *
 *   -1         no CPU given
 *   -2         requests given but CPU has no CU info or no counter stats
 *    0         CPU doesn't belong to any CMT PG
 *   otherwise  number of events, as reported by cu_plat_cpc_init() when it
 *              returns a non-negative value, or as determined here
 *
 * NOTE: KM_NOSLEEP is used for memory allocation since cpu_lock is held.  A
 *       PG whose event or counter info cannot be set up is skipped.
 */
static int
cu_cpc_init(cpu_t *cp, kcpc_request_list_t *reqs, int nreqs)
{
        cu_cntr_info_t  **info_tbl = NULL;
        cu_cntr_stats_t *stats = NULL;
        cu_cpu_info_t   *cpu_info;
        cpu_pg_t        *pgs;
        group_t         *lineage;
        group_iter_t    iter;
        pg_cmt_t        *pg;
        int             count;

        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * A target CPU is required
         */
        if (cp == NULL)
                return (-1);

        /*
         * No groups, no events
         */
        pgs = cp->cpu_pg;
        if (pgs == NULL || GROUP_SIZE(&pgs->cmt_pgs) < 1)
                return (0);

        lineage = &pgs->cmt_pgs;
        cpu_info = cp->cpu_cu_info;

        /*
         * Counter statistics and info are only needed (and required) when
         * requests are actually going to be added
         */
        if (reqs != NULL) {
                if (cpu_info == NULL || cpu_info->cu_cntr_stats == NULL)
                        return (-2);

                stats = cpu_info->cu_cntr_stats;
                info_tbl = cpu_info->cu_cntr_info;
        }

        /*
         * Platform (or processor) specific code gets the first chance to say
         * which CPC events are needed on this machine.  A negative return
         * means that it does not know.
         */
        count = cu_plat_cpc_init(cp, reqs, nreqs);
        if (count >= 0)
                return (count);

        /*
         * Common code decides.  Walk CPU's PG lineage and for each hardware
         * sharing relationship that is measured:
         *
         * - Set up CPC request, counter info, and stats needed for the counter
         *   event that measures capacity and utilization
         *
         * - Create PG CPU kstats to export capacity and utilization
         */
        count = 0;
        group_iter_init(&iter);
        while ((pg = group_iterate(lineage, &iter)) != NULL) {
                pghw_t          *hw = (pghw_t *)pg;
                pghw_type_t     hw_type = hw->pghw_hw;
                cu_cntr_info_t  *info;
                char            *event;
                int             first = count;

                /*
                 * Pick the counter event for this kind of PG hardware
                 * relationship; other kinds are not measured
                 */
                switch (hw_type) {
                case PGHW_IPIPE:
                        event = "PAPI_tot_ins";
                        break;

                case PGHW_FPU:
                        event = "PAPI_fp_ins";
                        break;

                default:
                        continue;
                }

                /*
                 * A non-zero return covers both failure and the "count only"
                 * case (reqs == NULL), in which the count has been advanced
                 * already
                 */
                if (cu_cpc_req_add(event, reqs, nreqs, stats, KM_NOSLEEP,
                    &count) != 0)
                        continue;

                info = info_tbl[hw_type];

                /*
                 * Nothing to measure for this hardware sharing relationship,
                 * so drop any counter info left for it
                 */
                if (count == first) {
                        if (info != NULL) {
                                kmem_free(info, sizeof (cu_cntr_info_t));
                                info_tbl[hw_type] = NULL;
                        }
                        continue;
                }

                /*
                 * Allocate counter info on first use and fill it in
                 */
                if (info == NULL) {
                        info = kmem_zalloc(sizeof (cu_cntr_info_t),
                            KM_NOSLEEP);
                        if (info == NULL)
                                continue;
                        info_tbl[hw_type] = info;
                }
                info->ci_cpu = cp;
                info->ci_pg = hw;
                info->ci_stats = &stats[first];
                info->ci_nstats = 1;    /* one event per relationship */

                /*
                 * Export this hardware relationship through PG CPU kstats
                 */
                cu_cpu_kstat_create(hw, info);
        }

        return (count);
}


/*
 * Program counters for capacity and utilization on given CPU.  Must run on
 * the given CPU at high PIL.
 *
 * The status is stored in *err:
 *
 *    0  counters programmed
 *    1  nothing done: DTrace CPC is active, counters are programmed already,
 *       or counters are disabled (by calls to cu_cpu_disable())
 *   -1  CU framework is not on
 *   -2  no CPU given
 *   -3  the cpu_cu_info field of the cpu structure is NULL
 *   -4  CPU has a CPC context that is not marked KCPC_CTX_INVALID_STOPPED
 *   -5  CPU has no CPC contexts set up for capacity and utilization
 */
void
cu_cpc_program(cpu_t *cp, int *err)
{
        cu_cpu_info_t   *info;
        cu_cpc_ctx_t    *ctx_set;
        kcpc_ctx_t      *next;
        int             status;

        ASSERT(IS_HIPIL());

        /*
         * Preemption stays disabled for the whole routine to keep the CPU
         * from disappearing and to keep flags and CPC context from changing
         * underneath us
         */
        kpreempt_disable();
        ASSERT(cp == CPU);

        if ((cu_flags & CU_FLAG_ON) == 0) {
                status = -1;
                goto done;
        }

        if (cp == NULL) {
                status = -2;
                goto done;
        }

        info = cp->cpu_cu_info;
        if (info == NULL) {
                status = -3;
                goto done;
        }

        /*
         * Leave the counters alone when DTrace CPC is using them, when they
         * are on already, or when they have been disabled
         */
        if (dtrace_cpc_in_use || (info->cu_flag & CU_CPU_CNTRS_ON) ||
            info->cu_disabled) {
                status = 1;
                goto done;
        }

        /*
         * Somebody else has a live CPC context on this CPU
         */
        if (CPU->cpu_cpc_ctx != NULL &&
            !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
                status = -4;
                goto done;
        }

        /*
         * CPU's set of CPC contexts for capacity and utilization
         */
        ctx_set = &info->cu_cpc_ctx;
        ASSERT(ctx_set != NULL);
        ASSERT(ctx_set->nctx >= 0);

        ASSERT(ctx_set->ctx_ptr_array == NULL ||
            ctx_set->ctx_ptr_array_sz > 0);
        ASSERT(ctx_set->nctx <= ctx_set->ctx_ptr_array_sz);
        if (ctx_set->nctx <= 0 || ctx_set->ctx_ptr_array == NULL ||
            ctx_set->ctx_ptr_array_sz <= 0) {
                status = -5;
                goto done;
        }

        /*
         * Advance (with wrap around) to the next context to program.
         *
         * NOTE: The index is moved before the counters are programmed so
         *       that it always refers to the *current* context, which is the
         *       one that has to be unprogrammed if that becomes necessary
         */
        ctx_set->cur_index = (ctx_set->cur_index + 1) % ctx_set->nctx;
        next = ctx_set->ctx_ptr_array[ctx_set->cur_index];

        /*
         * Clear KCPC_CTX_INVALID and KCPC_CTX_INVALID_STOPPED before
         * programming.
         *
         * KCPC_CTX_INVALID_STOPPED is set when the context is unprogrammed,
         * and KCPC_CTX_INVALID may be set when kcpc_invalidate_all() is
         * called by cpustat(8) and dtrace CPC to invalidate all CPC contexts
         * before they take over all the counters.
         *
         * This isn't strictly necessary since these flags are only used for
         * thread bound CPC contexts, not CPU bound ones like those used for
         * capacity and utilization.  The update needs no protection since no
         * one is using this context now.
         */
        next->kc_flags &= ~(KCPC_CTX_INVALID | KCPC_CTX_INVALID_STOPPED);

        kcpc_program(next, B_FALSE, B_FALSE);
        cp->cpu_cpc_ctx = next;

        /*
         * Record that the counters are programmed and that they are in the
         * middle of the off to on transition.  The transition flag makes
         * cu_cpu_update() update stop times for all programmed counters and
         * is cleared again right after the statistics have been updated.
         */
        info->cu_flag |= CU_CPU_CNTRS_ON | CU_CPU_CNTRS_OFF_ON;
        (void) cu_cpu_update(cp, B_FALSE);
        info->cu_flag &= ~CU_CPU_CNTRS_OFF_ON;

        status = 0;

done:
        *err = status;
        kpreempt_enable();
}


/*
 * Cross call wrapper routine for cu_cpc_program()
 *
 * cu_cpc_program() needs to assert that nobody else is using the counters in
 * order to catch broken code, so whether the counters on this CPU are in use
 * by someone else is checked here first.  The check has to be made on the
 * target CPU since a CPU's CPC context can only be changed while running on
 * that CPU.
 *
 * When arg is TRUE, a valid thread bound CPC context on the current thread
 * also prevents programming.  This keeps thread counters from being
 * re-programmed with CU counters when a CPU is coming on-line.
 *
 * *err is set to -100 when the CPU has a live CPC context, to -200 when the
 * current thread has one (and arg is TRUE), and to whatever cu_cpc_program()
 * reports otherwise.
 */
static void
cu_cpc_program_xcall(uintptr_t arg, int *err)
{
        boolean_t       check_thread = (boolean_t)arg;

        kpreempt_disable();

        if (CPU->cpu_cpc_ctx != NULL &&
            !(CPU->cpu_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
                /* CPU bound context in use */
                *err = -100;
        } else if (check_thread && (curthread->t_cpc_ctx != NULL) &&
            !(curthread->t_cpc_ctx->kc_flags & KCPC_CTX_INVALID_STOPPED)) {
                /* Thread bound context in use */
                *err = -200;
        } else {
                cu_cpc_program(CPU, err);
        }

        kpreempt_enable();
}


/*
 * Unprogram counters for capacity and utilization on given CPU
 * This function should be always executed on the target CPU at high PIL
 *
 * The status is stored in *err:
 *
 *    0  counters unprogrammed
 *    1  nothing done since counters are turned off already
 *   -1  CU framework is not on
 *   -3  the cpu_cu_info field of the cpu structure is NULL
 *   -5  CPU has no CPC contexts set up for capacity and utilization
 *   -6  CPU's CPC context is not the current capacity and utilization one
 */
void
cu_cpc_unprogram(cpu_t *cp, int *err)
{
        cu_cpu_info_t   *info;
        cu_cpc_ctx_t    *ctx_set;
        kcpc_ctx_t      *cur;
        int             status;

        ASSERT(IS_HIPIL());

        /*
         * Preemption stays disabled for the whole routine to keep the CPU
         * from disappearing and to keep flags and CPC context from changing
         * underneath us
         */
        kpreempt_disable();
        ASSERT(cp == CPU);

        if ((cu_flags & CU_FLAG_ON) == 0) {
                status = -1;
                goto done;
        }

        info = cp->cpu_cu_info;
        if (info == NULL) {
                status = -3;
                goto done;
        }

        if ((info->cu_flag & CU_CPU_CNTRS_ON) == 0) {
                status = 1;
                goto done;
        }

        /*
         * Pick up the latest counter values before the counters go away
         */
        (void) cu_cpu_update(cp, B_FALSE);

        /*
         * Find the context that is currently programmed
         */
        ctx_set = &info->cu_cpc_ctx;
        if (ctx_set->nctx <= 0 || ctx_set->ctx_ptr_array == NULL ||
            ctx_set->ctx_ptr_array_sz <= 0) {
                status = -5;
                goto done;
        }
        cur = ctx_set->ctx_ptr_array[ctx_set->cur_index];

        /*
         * It has to be the very context that the CPU is using
         */
        ASSERT(cp->cpu_cpc_ctx == cur);
        if (cp->cpu_cpc_ctx != cur) {
                status = -6;
                goto done;
        }

        kcpc_unprogram(cur, B_FALSE);

        ASSERT(cur->kc_flags & KCPC_CTX_INVALID_STOPPED);

        /*
         * CPU's counters are not programmed for capacity and utilization
         * anymore
         */
        cp->cpu_cpc_ctx = NULL;
        info->cu_flag &= ~CU_CPU_CNTRS_ON;

        status = 0;

done:
        *err = status;
        kpreempt_enable();
}


/*
 * Add given counter event to list of CPC requests
 *
 * Arguments:
 *
 *   event       name of counter event to add
 *   reqs        list of CPC requests to add to, or NULL to only count
 *   nreqs       number of requests that the list has room for
 *   stats       array of counter stats; entry *nevents is handed to the
 *               request as its data (assumed to have at least nreqs
 *               entries - the allocation is done by the caller)
 *   kmem_flags  memory allocation flags passed on to kcpc_reqs_add()
 *   nevents     in/out: number of events so far; incremented when the event
 *               is counted or added
 *
 * Return values:
 *
 *    0  request added and *nevents incremented
 *   -1  no event given, event not supported by PCBE, or nevents not given
 *   -3  no request list given; *nevents has been incremented all the same,
 *       which is how callers count the events that are needed
 *   -4  no stats given or no room left on list of requests
 *   -5  kcpc_reqs_add() failed
 */
static int
cu_cpc_req_add(char *event, kcpc_request_list_t *reqs, int nreqs,
    cu_cntr_stats_t *stats, int kmem_flags, int *nevents)
{
        int     n;
        int     retval;
        uint_t  flags;

        /*
         * Return error when no counter event specified, counter event not
         * supported by CPC's PCBE, or number of events not given
         */
        if (event == NULL || kcpc_event_supported(event) == B_FALSE ||
            nevents == NULL)
                return (-1);

        n = *nevents;

        /*
         * Only count number of counter events needed if list
         * where to add CPC requests not given
         */
        if (reqs == NULL) {
                n++;
                *nevents = n;
                return (-3);
        }

        /*
         * Return error when stats not given or not enough room on list of CPC
         * requests for more counter events.
         *
         * Either condition alone means that there is no room: the list has no
         * slots at all (nreqs <= 0) or all of its slots are taken already
         * (n >= nreqs).  Both have to be checked on their own so that stats[n]
         * below is never used with n beyond the end of the list.
         */
        if (stats == NULL || nreqs <= 0 || n >= nreqs)
                return (-4);

        /*
         * Use flags in cu_cpc_flags to program counters and enable overflow
         * interrupts/traps (unless PCBE can't handle overflow interrupts) so
         * PCBE can catch counters before they wrap to hopefully give us an
         * accurate (64-bit) virtualized counter
         */
        flags = cu_cpc_flags;
        if ((kcpc_pcbe_capabilities() & CPC_CAP_OVERFLOW_INTERRUPT) == 0)
                flags &= ~CPC_OVF_NOTIFY_EMT;

        /*
         * Add CPC request to list
         */
        retval = kcpc_reqs_add(reqs, event, cu_cpc_preset_value,
            flags, 0, NULL, &stats[n], kmem_flags);

        if (retval != 0)
                return (-5);

        n++;
        *nevents = n;
        return (0);
}

static void
cu_cpu_info_detach_xcall(void)
{
        ASSERT(IS_HIPIL());

        CPU->cpu_cu_info = NULL;
}


/*
 * Enable or disable collection of capacity/utilization data for the current
 * CPU.  arg1 is the CPU (which has to be the CPU this is running on) and arg2
 * says what to do: counters are enabled if it is True and disabled if it is
 * False.
 *
 * Disable requests nest.  Each one bumps the CPU's cu_disabled count and only
 * the first one unprograms the counters.  Each enable request lowers the count
 * (never below zero) and the counters are programmed once it is zero.
 *
 * Nothing is done when the module is not on or the CPU has no CU info.
 * This function should be always executed at high PIL
 */
static void
cu_cpc_trigger(uintptr_t arg1, uintptr_t arg2)
{
        cpu_t           *cp = (cpu_t *)arg1;
        boolean_t       enable = (boolean_t)arg2;
        cu_cpu_info_t   *info;
        int             rc;

        ASSERT(IS_HIPIL());
        kpreempt_disable();
        ASSERT(cp == CPU);

        if ((cu_flags & CU_FLAG_ON) == 0)
                goto out;

        info = cp->cpu_cu_info;
        if (info == NULL)
                goto out;

        /*
         * Counters can't be on while they are disabled
         */
        ASSERT(!info->cu_disabled || !(info->cu_flag & CU_CPU_CNTRS_ON));

        if (!enable) {
                /*
                 * Count this request and turn CU off if this is the first
                 * attempt to disable it
                 */
                if (info->cu_disabled++ == 0) {
                        cu_cpc_unprogram(cp, &rc);
                        ASSERT(!(info->cu_flag & CU_CPU_CNTRS_ON));
                }
        } else {
                /*
                 * Undo one disable request, if there is any, and program the
                 * counters once none are left
                 */
                if (info->cu_disabled > 0)
                        info->cu_disabled--;
                if (info->cu_disabled == 0)
                        cu_cpc_program(CPU, &rc);
        }

out:
        kpreempt_enable();
}


/*
 * Callback for changes in CPU states
 * Used to set up, start, or tear down hardware performance counters on CPUs
 * that are turned on or off
 *
 * Returns -1 when capacity and utilization support is not on, -2 when no CPU
 * is found for the given id, and otherwise the result of the routine run for
 * the state change (0 for state changes that are not handled here).
 *
 * NOTE: cpc should be programmed/unprogrammed while running on the target CPU.
 * cpu_lock is held by the caller; cu_cpu_run() and cu_cpu_disable() hand the
 * per-CPU work to the target CPU through cpu_call().
 */
static int
/* LINTED E_FUNC_ARG_UNUSED */
cu_cpu_callback(cpu_setup_t what, int id, void *arg)
{
        cpu_t   *target;
        int     rc = 0;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (!(cu_flags & CU_FLAG_ON))
                return (-1);

        target = cpu_get(id);
        if (target == NULL)
                return (-2);

        if (what == CPU_ON) {
                /*
                 * Setup counters on CPU being turned on, then reset the list
                 * of counter event requests so its space can be reused for a
                 * different set of requests for next CPU
                 */
                rc = cu_cpu_init(target, cu_cpc_reqs);
                (void) kcpc_reqs_reset(cu_cpc_reqs);
        } else if (what == CPU_INTR_ON) {
                /*
                 * Program counters on the CPU being turned on
                 */
                rc = cu_cpu_run(target, cu_cpc_program_xcall,
                    (uintptr_t)B_TRUE);
        } else if (what == CPU_OFF) {
                /*
                 * Disable counters on CPU being turned off. Counters will not
                 * be re-enabled on this CPU until it comes back online.
                 */
                cu_cpu_disable(target);
                ASSERT(!CU_CPC_ON(target));
                rc = cu_cpu_fini(target);
        }

        return (rc);
}


/*
 * Disable Capacity Utilization counters on a given CPU by having that CPU run
 * cu_cpc_trigger() with its enable argument set to B_FALSE. This function can
 * be called from any CPU to disable counters on the given CPU.
 */
static void
cu_cpu_disable(cpu_t *cp)
{
        uintptr_t       turn_on = (uintptr_t)B_FALSE;

        cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, turn_on);
}


/*
 * Enable Capacity Utilization counters on a given CPU by having that CPU run
 * cu_cpc_trigger() with its enable argument set to B_TRUE.
 */
static void
cu_cpu_enable(cpu_t *cp)
{
        uintptr_t       turn_on = (uintptr_t)B_TRUE;

        cpu_call(cp, cu_cpc_trigger, (uintptr_t)cp, turn_on);
}


/*
 * Setup capacity and utilization support for given CPU
 *
 * Return values:
 *       0      CPC contexts were created and recorded for the CPU
 *       1      the CPU already has CPC contexts, no new ones were created
 *      -1      not ready to setup counters yet (CU_FLAG_READY not set)
 *      -2      couldn't allocate the CPU's capacity and utilization info
 *      -3      couldn't determine the number of CPC events needed
 *      -4      couldn't allocate the counter statistics
 *      -5      couldn't fill in the list of CPC event requests
 *      -6      couldn't create the CPC contexts
 *      -7      context creation handed back no context pointer array
 *
 * For -3 through -7, cu_cpu_fini() is called on the CPU before returning.
 *
 * NOTE: Use KM_NOSLEEP for kmem_{,z}alloc() since cpu_lock is held and free
 *       everything that has been successfully allocated including cpu_cu_info
 *       if any memory allocation fails
 */
static int
cu_cpu_init(cpu_t *cp, kcpc_request_list_t *reqs)
{
        kcpc_ctx_t      **ctxs = NULL;
        size_t          ctxs_sz = 0;
        cu_cpc_ctx_t    *cpc;
        cu_cpu_info_t   *info;
        int             count;
        int             err;

        /*
         * cpu_lock should be held and protect against CPU going away and races
         * with cu_{init,fini,cpu_fini}()
         */
        ASSERT(MUTEX_HELD(&cpu_lock));

        if (!(cu_flags & CU_FLAG_READY))
                return (-1);

        /*
         * Allocate the CPU's capacity and utilization info on first use
         */
        info = cp->cpu_cu_info;
        if (info == NULL) {
                info = kmem_zalloc(sizeof (*info), KM_NOSLEEP);
                if (info == NULL)
                        return (-2);
                cp->cpu_cu_info = info;
        }

        info->cu_cpu = cp;
        info->cu_disabled = dtrace_cpc_in_use ? 1 : 0;

        /*
         * Done if the CPU's CPC contexts have been setup already
         */
        cpc = &info->cu_cpc_ctx;
        if (cpc->nctx > 0 && cpc->ctx_ptr_array != NULL &&
            cpc->ctx_ptr_array_sz > 0)
                return (1);

        /*
         * Should have no contexts since it hasn't been setup already
         */
        ASSERT(cpc->nctx == 0 && cpc->ctx_ptr_array == NULL &&
            cpc->ctx_ptr_array_sz == 0);

        /*
         * First pass without a request list only determines how many CPC
         * events are needed to measure capacity and utilization for this CPU
         */
        count = cu_cpc_init(cp, NULL, 0);
        if (count <= 0) {
                err = -3;
                goto fail;
        }

        /*
         * Allocate space for counter statistics for each event
         */
        info->cu_cntr_stats = kmem_zalloc(count * sizeof (cu_cntr_stats_t),
            KM_NOSLEEP);
        if (info->cu_cntr_stats == NULL) {
                err = -4;
                goto fail;
        }
        info->cu_ncntr_stats = count;

        /*
         * Second pass fills in the list of CPC event requests with the
         * corresponding counter stats for each request
         */
        count = cu_cpc_init(cp, reqs, count);
        if (count <= 0) {
                err = -5;
                goto fail;
        }

        /*
         * Create CPC context with given requests
         */
        count = kcpc_cpu_ctx_create(cp, reqs, KM_NOSLEEP, &ctxs, &ctxs_sz);
        if (count <= 0) {
                err = -6;
                goto fail;
        }

        /*
         * Should have contexts
         */
        ASSERT(count > 0 && ctxs != NULL && ctxs_sz > 0);
        if (ctxs == NULL || ctxs_sz == 0) {
                err = -7;
                goto fail;
        }

        /*
         * Fill in CPC context info for CPU needed for capacity and utilization
         */
        cpc->cur_index = 0;
        cpc->nctx = count;
        cpc->ctx_ptr_array = ctxs;
        cpc->ctx_ptr_array_sz = ctxs_sz;
        return (0);

fail:
        (void) cu_cpu_fini(cp);
        return (err);
}

/*
 * Tear down capacity and utilization support for given CPU
 *
 * Returns -1 when CU_FLAG_READY is not set, 1 when the CPU has no capacity
 * and utilization info, and 0 once everything has been freed.
 */
static int
cu_cpu_fini(cpu_t *cp)
{
        cu_cpu_info_t   *info;
        cu_cpc_ctx_t    *cpc;
        pghw_type_t     hw;
        int             idx;

        /*
         * cpu_lock should be held and protect against CPU going away and races
         * with cu_{init,fini,cpu_init}()
         */
        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * Have to at least be ready to setup counters to have allocated
         * anything that needs to be deallocated now
         */
        if (!(cu_flags & CU_FLAG_READY))
                return (-1);

        /*
         * Nothing to do if CPU's capacity and utilization info doesn't exist
         */
        info = cp->cpu_cu_info;
        if (info == NULL)
                return (1);

        /*
         * Delete the kstat (if any) and free the counter info of every
         * hardware sharing relationship that has one
         */
        for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {
                cu_cntr_info_t  *ci = info->cu_cntr_info[hw];

                if (ci != NULL) {
                        if (ci->ci_kstat != NULL) {
                                kstat_delete(ci->ci_kstat);
                                ci->ci_kstat = NULL;
                        }
                        kmem_free(ci, sizeof (cu_cntr_info_t));
                }
        }

        /*
         * Free counter statistics for CPU
         */
        ASSERT(info->cu_cntr_stats == NULL || info->cu_ncntr_stats > 0);
        if (info->cu_cntr_stats != NULL && info->cu_ncntr_stats > 0) {
                size_t  stats_sz;

                stats_sz = info->cu_ncntr_stats * sizeof (cu_cntr_stats_t);
                kmem_free(info->cu_cntr_stats, stats_sz);
                info->cu_cntr_stats = NULL;
                info->cu_ncntr_stats = 0;
        }

        /*
         * Free the CPU's CPC contexts and the array pointing at them unless
         * they have been freed already, then zero the CPC info
         */
        cpc = &info->cu_cpc_ctx;
        if (cpc->ctx_ptr_array != NULL && cpc->ctx_ptr_array_sz > 0) {
                for (idx = 0; idx < cpc->nctx; idx++) {
                        kcpc_ctx_t      *ctx = cpc->ctx_ptr_array[idx];

                        if (ctx != NULL)
                                kcpc_free_cpu(ctx);
                }

                kmem_free(cpc->ctx_ptr_array, cpc->ctx_ptr_array_sz);
                bzero(cpc, sizeof (cu_cpc_ctx_t));
        }

        /*
         * Set cp->cpu_cu_info pointer to NULL. Go through cross-call when the
         * CPU is online to ensure that no one is going to access the
         * cpu_cu_info which we are going to free.
         */
        if (!cpu_is_online(cp))
                cp->cpu_cu_info = NULL;
        else
                cpu_call(cp, (cpu_call_func_t)cu_cpu_info_detach_xcall, 0, 0);

        /*
         * Free CPU's capacity and utilization info
         */
        kmem_free(info, sizeof (cu_cpu_info_t));

        return (0);
}

/*
 * Create capacity & utilization kstats for given PG CPU hardware sharing
 * relationship
 *
 * The kstat is named after the PG's hardware sharing relationship and uses the
 * CPU id as its instance.  On success it is recorded in cntr_info->ci_kstat
 * and installed; nothing is created when cntr_info or its CPU is missing or
 * when the kstat cannot be created.
 */
static void
cu_cpu_kstat_create(pghw_t *pg, cu_cntr_info_t *cntr_info)
{
        kstat_t         *ks;
        char            *sharing = pghw_type_string(pg->pghw_hw);
        char            name[KSTAT_STRLEN + 1];

        /*
         * Just return when no counter info or CPU
         */
        if (cntr_info == NULL || cntr_info->ci_cpu == NULL)
                return;

        /*
         * Canonify PG name to conform to kstat name rules.  strncpy() leaves
         * the destination unterminated when the source fills it, so terminate
         * it explicitly, and bound the canonification by the size of this
         * buffer rather than by an unrelated name length.
         */
        (void) strncpy(name, sharing, sizeof (name) - 1);
        name[sizeof (name) - 1] = '\0';
        strident_canon(name, sizeof (name));

        if ((ks = kstat_create_zone("pg_hw_perf_cpu",
            cntr_info->ci_cpu->cpu_id,
            name, "processor_group", KSTAT_TYPE_NAMED,
            sizeof (cu_cpu_kstat) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL, GLOBAL_ZONEID)) == NULL)
                return;

        /*
         * All of these kstats share the cu_cpu_kstat template and its lock;
         * the update routine fills it in from ks_private
         */
        ks->ks_lock = &pg_cpu_kstat_lock;
        ks->ks_data = &cu_cpu_kstat;
        ks->ks_update = cu_cpu_kstat_update;

        /*
         * Account for the sharing relationship string in the data size
         */
        ks->ks_data_size += strlen(sharing) + 1;

        ks->ks_private = cntr_info;
        cntr_info->ci_kstat = ks;
        kstat_install(cntr_info->ci_kstat);
}


/*
 * Propagate values from CPU capacity & utilization stats to kstats
 *
 * Writes are refused with EACCES.  On a read the shared cu_cpu_kstat template
 * is filled in from the counter info hanging off ks_private and 0 is returned.
 */
static int
cu_cpu_kstat_update(kstat_t *ksp, int rw)
{
        cu_cntr_info_t          *info = ksp->ks_private;
        struct cu_cpu_kstat     *ks = &cu_cpu_kstat;
        cu_cntr_stats_t         *cs;
        cpu_t                   *cpu;
        pghw_t                  *hw;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        cpu = info->ci_cpu;
        hw = info->ci_pg;
        ks->cu_cpu_id.value.ui32 = cpu->cpu_id;
        ks->cu_pg_id.value.i32 = ((pg_t *)hw)->pg_id;

        if (secpolicy_cpc_cpu(crgetcred()) == 0) {
                /*
                 * Caller passed the secpolicy_cpc_cpu() check, so report the
                 * real utilization data
                 */
                kpreempt_disable();

                /*
                 * Update capacity and utilization statistics needed for CPU's
                 * PG (CPU) kstats
                 */
                (void) cu_cpu_update(cpu, B_TRUE);

                cs = info->ci_stats;
                ks->cu_generation.value.ui32 = cpu->cpu_generation;
                kstat_named_setstr(&ks->cu_cpu_relationship,
                    pghw_type_string(hw->pghw_hw));

                ks->cu_cpu_util.value.ui64 = cs->cs_value_total;
                ks->cu_cpu_rate.value.ui64 = cs->cs_rate;
                ks->cu_cpu_rate_max.value.ui64 = cs->cs_rate_max;
                ks->cu_cpu_time_running.value.ui64 = cs->cs_time_running;
                ks->cu_cpu_time_stopped.value.ui64 = cs->cs_time_stopped;

                /*
                 * When counters are stopped now, the cs_time_stopped was last
                 * updated at cs_time_start time. Add the time passed since
                 * then to the stopped time.
                 */
                if (!(cpu->cpu_cu_info->cu_flag & CU_CPU_CNTRS_ON))
                        ks->cu_cpu_time_stopped.value.ui64 +=
                            gethrtime() - cs->cs_time_start;

                kpreempt_enable();

                return (0);
        }

        /*
         * The caller should have priv_cpc_cpu privilege to get utilization
         * data. Callers who do not have the privilege will see zeroes as the
         * values.
         */
        ks->cu_generation.value.ui32 = cpu->cpu_generation;
        kstat_named_setstr(&ks->cu_cpu_relationship,
            pghw_type_string(hw->pghw_hw));

        ks->cu_cpu_util.value.ui64 = 0;
        ks->cu_cpu_rate.value.ui64 = 0;
        ks->cu_cpu_rate_max.value.ui64 = 0;
        ks->cu_cpu_time_running.value.ui64 = 0;
        ks->cu_cpu_time_stopped.value.ui64 = 0;

        return (0);
}

/*
 * Run specified function with specified argument on a given CPU and return
 * whatever the function reports
 *
 * The function is handed to cpu_call() together with 'arg' and the address of
 * a local result that starts out as 0; the function is expected to store its
 * status there, and that value is what gets returned.
 */
static int
cu_cpu_run(cpu_t *cp, cu_cpu_func_t func, uintptr_t arg)
{
        cpu_call_func_t handler = (cpu_call_func_t)(uintptr_t)func;
        int             result = 0;

        cpu_call(cp, handler, arg, (uintptr_t)&result);

        return (result);
}


/*
 * Update counter statistics on a given CPU.
 *
 * If move_to argument is True, execute the function on the CPU specified
 * Otherwise, assume that it is already running on the right CPU
 *
 * If move_to is specified, the caller should hold cpu_lock or have preemption
 * disabled. Otherwise it is up to the caller to guarantee that things do not
 * change in the process.
 *
 * Returns 0 when there is nothing to do, when the sample is dropped because
 * the CPU was updated recently, and whenever move_to is True.  Otherwise the
 * result of kcpc_read() is returned, with negative results moved down by 10.
 */
int
cu_cpu_update(struct cpu *cp, boolean_t move_to)
{
        cu_cpu_info_t   *info = cp->cpu_cu_info;
        hrtime_t        now;
        int             rc;

        ASSERT(!move_to || MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0);

        /*
         * Nothing to do if counters are not programmed
         */
        if (!(cu_flags & CU_FLAG_ON))
                return (0);
        if (info == NULL || !(info->cu_flag & CU_CPU_CNTRS_ON))
                return (0);

        /*
         * Don't update CPU statistics if it was updated recently
         * and provide old results instead
         */
        now = gethrtime();
        if ((now - info->cu_sample_time) < cu_update_threshold) {
                DTRACE_PROBE1(cu__drop__sample, cpu_t *, cp);
                return (0);
        }
        info->cu_sample_time = now;

        /*
         * CPC counter should be read on the CPU that is running the counter. We
         * either have to move ourselves to the target CPU or ensure that we
         * already run there.
         *
         * When asked to move, use cross-call to the target CPU to execute
         * kcpc_read() and cu_cpu_update_stats() there; its result is not
         * reported back to the caller.
         */
        if (move_to) {
                (void) cu_cpu_run(cp, (cu_cpu_func_t)(uintptr_t)kcpc_read,
                    (uintptr_t)cu_cpu_update_stats);
                return (0);
        }

        /*
         * Offset negative return value by -10 so we can distinguish it
         * from error return values of this routine vs kcpc_read()
         */
        rc = kcpc_read((kcpc_update_func_t)cu_cpu_update_stats);
        if (rc < 0)
                rc -= 10;

        return (rc);
}


/*
 * Update CPU counter statistics for current CPU.
 * This function may be called from a cross-call
 *
 * 'stats' are the statistics to update and 'cntr_value' is the current value
 * of the corresponding counter.
 *
 * Returns 0 when the statistics were updated, -1 when no stats were given,
 * and -2 when capacity and utilization support is off, the current CPU has no
 * capacity and utilization info, or its counters are not on.
 */
static int
cu_cpu_update_stats(cu_cntr_stats_t *stats, uint64_t cntr_value)
{
        cu_cpu_info_t   *info = CPU->cpu_cu_info;
        uint_t          state;
        uint64_t        counted;
        hrtime_t        elapsed;
        hrtime_t        now;
        int             restarted;

        if (stats == NULL)
                return (-1);

        /*
         * Nothing to do if counters are not programmed. This should not happen,
         * but we check just in case.
         */
        ASSERT(cu_flags & CU_FLAG_ON);
        ASSERT(info != NULL);
        if (info == NULL || !(cu_flags & CU_FLAG_ON))
                return (-2);

        state = info->cu_flag;
        ASSERT(state & CU_CPU_CNTRS_ON);
        if (!(state & CU_CPU_CNTRS_ON))
                return (-2);

        /*
         * Take snapshot of high resolution timer
         */
        now = gethrtime();

        /*
         * When CU counters have just been programmed we cannot assume that
         * the new cntr_value continues from where we left off, so cntr_value
         * only becomes the new initial value and nothing is counted for this
         * sample.  Otherwise count the change since the start of the sampling
         * period.
         */
        restarted = (state & CU_CPU_CNTRS_OFF_ON) != 0;
        counted = restarted ? 0 : cntr_value - stats->cs_value_start;

        /*
         * Calculate time between start of sampling period and now; there is
         * no previous period when cs_time_start is still 0
         */
        if (stats->cs_time_start != 0)
                elapsed = now - stats->cs_time_start;
        else
                elapsed = 0;

        /*
         * Start the next sampling period here
         */
        stats->cs_time_start = now;
        stats->cs_value_start = cntr_value;

        /*
         * Update either running or stopped time based on the transition
         * state (wrap shouldn't happen)
         */
        if (elapsed > 0) {
                if (restarted)
                        stats->cs_time_stopped += elapsed;
                else
                        stats->cs_time_running += elapsed;
        }

        /*
         * Update rest of counter statistics if the counter moved
         */
        if (counted > 0) {
                /*
                 * Update utilization rate if the interval between samples is
                 * sufficient.
                 */
                ASSERT(cu_sample_interval_min > CU_SCALE);
                if (elapsed > cu_sample_interval_min)
                        stats->cs_rate = CU_RATE(counted, elapsed);
                if (stats->cs_rate_max < stats->cs_rate)
                        stats->cs_rate_max = stats->cs_rate;

                stats->cs_value_last = counted;
                stats->cs_value_total += counted;
        }

        return (0);
}

/*
 * Update CMT PG utilization data.
 *
 * This routine computes the running total utilization and times for the
 * specified PG by adding up the total utilization and counter running and
 * stopped times of all CPUs in the PG and calculates the utilization rate and
 * maximum rate for all CPUs in the PG.
 *
 * The caller must hold cpu_lock.  The rate is left untouched when the PG had
 * no utilization recorded on entry, and is only recomputed when the time since
 * the previous update exceeds cu_sample_interval_min, the same threshold that
 * cu_cpu_update_stats() applies to the per CPU rate.
 */
void
cu_pg_update(pghw_t *pg)
{
        pg_cpu_itr_t    cpu_iter;
        pghw_type_t     pg_hwtype;
        cpu_t           *cpu;
        pghw_util_t     *hw_util = &pg->pghw_stats;
        uint64_t        old_utilization = hw_util->pghw_util;
        hrtime_t        now;
        hrtime_t        time_delta;
        uint64_t        utilization_delta;

        ASSERT(MUTEX_HELD(&cpu_lock));

        now = gethrtime();

        pg_hwtype = pg->pghw_hw;

        /*
         * Initialize running total utilization and times for PG to 0
         */
        hw_util->pghw_util = 0;
        hw_util->pghw_time_running = 0;
        hw_util->pghw_time_stopped = 0;

        /*
         * Iterate over all CPUs in the PG and aggregate utilization, running
         * time and stopped time.
         */
        PG_CPU_ITR_INIT(pg, cpu_iter);
        while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
                cu_cpu_info_t   *cu_cpu_info = cpu->cpu_cu_info;
                cu_cntr_info_t  *cntr_info;
                cu_cntr_stats_t *stats;

                /*
                 * Skip CPUs without capacity and utilization info
                 */
                if (cu_cpu_info == NULL)
                        continue;

                /*
                 * Update utilization data for the CPU and then
                 * aggregate per CPU running totals for PG
                 */
                (void) cu_cpu_update(cpu, B_TRUE);
                cntr_info = cu_cpu_info->cu_cntr_info[pg_hwtype];

                /*
                 * Skip CPUs with no counter info or stats for this PG's
                 * hardware sharing relationship
                 */
                if (cntr_info == NULL || (stats = cntr_info->ci_stats) == NULL)
                        continue;

                hw_util->pghw_util += stats->cs_value_total;
                hw_util->pghw_time_running += stats->cs_time_running;
                hw_util->pghw_time_stopped += stats->cs_time_stopped;

                /*
                 * If counters are stopped now, the pg_time_stopped was last
                 * updated at cs_time_start time. Add the time passed since then
                 * to the stopped time.
                 */
                if (!(cu_cpu_info->cu_flag & CU_CPU_CNTRS_ON))
                        hw_util->pghw_time_stopped +=
                            now - stats->cs_time_start;
        }

        /*
         * Compute per PG instruction rate and maximum rate
         */
        time_delta = now - hw_util->pghw_time_stamp;
        hw_util->pghw_time_stamp = now;

        /*
         * Without utilization recorded before this update there is nothing to
         * compute a change against, so leave the rate alone
         */
        if (old_utilization == 0)
                return;

        /*
         * Calculate change in utilization over sampling period and set this to
         * 0 if the delta would be 0 or negative which may happen if any CPUs go
         * offline during the sampling period
         */
        if (hw_util->pghw_util > old_utilization)
                utilization_delta = hw_util->pghw_util - old_utilization;
        else
                utilization_delta = 0;

        /*
         * Update utilization rate if the interval between samples is
         * sufficient.  Compare against cu_sample_interval_min, the value the
         * ASSERT below checks, so the check and the comparison agree.
         */
        ASSERT(cu_sample_interval_min > CU_SCALE);
        if (time_delta > cu_sample_interval_min)
                hw_util->pghw_rate = CU_RATE(utilization_delta, time_delta);

        /*
         * Update the maximum observed rate
         */
        if (hw_util->pghw_rate_max < hw_util->pghw_rate)
                hw_util->pghw_rate_max = hw_util->pghw_rate;
}