usr/src/uts/i86pc/os/cpupm/cpu_idle.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2009-2010, Intel Corporation.
 * All rights reserved.
 */
/*
 * Copyright 2019 Joyent, Inc.
 * Copyright 2025 Oxide Computer Company
 */

#include <sys/x86_archext.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <sys/stat.h>
#include <sys/acpi/acpi.h>
#include <sys/acpica.h>
#include <sys/cpu_acpi.h>
#include <sys/cpu_idle.h>
#include <sys/cpupm.h>
#include <sys/cpu_event.h>
#include <sys/hpet.h>
#include <sys/archsystm.h>
#include <vm/hat_i86.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/callb.h>

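/*
 * Timer selectors passed to cstate_use_timer() to pick between the HPET
 * proxy timer and the local APIC timer around a deep c-state transition.
 */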
#define CSTATE_USING_HPET               1
#define CSTATE_USING_LAT                2

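/*
 * Number of 10us intervals cpu_idle_stop() waits between warnings while a
 * CPU has not yet switched to the non-deep idle function (i.e. warn
 * roughly every 10ms).
 */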
#define CPU_IDLE_STOP_TIMEOUT           1000

extern void cpu_idle_adaptive(void);
extern uint32_t cpupm_next_cstate(cma_c_state_t *cs_data,
    cpu_acpi_cstate_t *cstates, uint32_t cs_count, hrtime_t start);

static int cpu_idle_init(cpu_t *);
static void cpu_idle_fini(cpu_t *);
static void cpu_idle_stop(cpu_t *);
static boolean_t cpu_deep_idle_callb(void *arg, int code);
static boolean_t cpu_idle_cpr_callb(void *arg, int code);
static void acpi_cpu_cstate(cpu_acpi_cstate_t *cstate);

static boolean_t cstate_use_timer(hrtime_t *lapic_expire, int timer);

/*
 * Flags indicating whether the CPU has an always-running local APIC timer
 * (ARAT), and whether the HPET timer is used in deep c-states.
 */
static boolean_t cpu_cstate_arat = B_FALSE;
static boolean_t cpu_cstate_hpet = B_FALSE;

/*
 * Interfaces for modules implementing Intel's deep c-state.
 */
cpupm_state_ops_t cpu_idle_ops = {
        "Generic ACPI C-state Support",
        cpu_idle_init,
        cpu_idle_fini,
        NULL,
        cpu_idle_stop
};

static kmutex_t         cpu_idle_callb_mutex;
static callb_id_t       cpu_deep_idle_callb_id;
static callb_id_t       cpu_idle_cpr_callb_id;
static uint_t           cpu_idle_cfg_state;

static kmutex_t cpu_idle_mutex;

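/*
 * A single virtual kstat buffer shared by all of the per-c-state kstats.
 * cpu_idle_kstat_update() fills it from the c-state hung off ks_private,
 * under cpu_idle_mutex (each kstat's ks_lock).
 */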
cpu_idle_kstat_t cpu_idle_kstat = {
        { "address_space_id",   KSTAT_DATA_STRING },
        { "latency",            KSTAT_DATA_UINT32 },
        { "power",              KSTAT_DATA_UINT32 },
};

/*
 * kstat update function for the c-state info
 */
static int
cpu_idle_kstat_update(kstat_t *ksp, int flag)
{
        cpu_acpi_cstate_t *cstate = ksp->ks_private;

        if (flag == KSTAT_WRITE) {
                return (EACCES);
        }

        if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "FFixedHW");
        } else if (cstate->cs_addrspace_id == ACPI_ADR_SPACE_SYSTEM_IO) {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "SystemIO");
        } else {
                kstat_named_setstr(&cpu_idle_kstat.addr_space_id,
                    "Unsupported");
        }

        cpu_idle_kstat.cs_latency.value.ui32 = cstate->cs_latency;
        cpu_idle_kstat.cs_power.value.ui32 = cstate->cs_power;

        return (0);
}

/*
 * Used during configuration callbacks to manage implementation specific
 * details of the hardware timer used during Deep C-state.
 */
boolean_t
cstate_timer_callback(int code)
{
        if (cpu_cstate_arat) {
                return (B_TRUE);
        } else if (cpu_cstate_hpet) {
                return (hpet.callback(code));
        }
        return (B_FALSE);
}

/*
 * Some Local APIC Timers do not work during Deep C-states.
 * The Deep C-state idle function uses this function to ensure it is using a
 * hardware timer that works during Deep C-states.  This function also
 * switches the timer back to the LAPIC Timer after Deep C-state.
 */
static boolean_t
cstate_use_timer(hrtime_t *lapic_expire, int timer)
{
        if (cpu_cstate_arat)
                return (B_TRUE);

        /*
         * We have to return B_FALSE if no arat or hpet support
         */
        if (!cpu_cstate_hpet)
                return (B_FALSE);

        switch (timer) {
        case CSTATE_USING_HPET:
                return (hpet.use_hpet_timer(lapic_expire));
        case CSTATE_USING_LAT:
                hpet.use_lapic_timer(*lapic_expire);
                return (B_TRUE);
        default:
                return (B_FALSE);
        }
}

/*
 * c-state wakeup function.
 * Similar to cpu_wakeup and cpu_wakeup_mwait except this function deals
 * with CPUs asleep in MWAIT, HLT, or ACPI Deep C-State.
 */
void
cstate_wakeup(cpu_t *cp, int bound)
{
        struct machcpu  *mcpu = &(cp->cpu_m);
        volatile uint32_t *mcpu_mwait = mcpu->mcpu_mwait;
        cpupart_t       *cpu_part;
        uint_t          cpu_found;
        processorid_t   cpu_sid;

        cpu_part = cp->cpu_part;
        cpu_sid = cp->cpu_seqid;
        if (bitset_in_set(&cpu_part->cp_haltset, cpu_sid)) {
                /*
                 * Clear the halted bit for that CPU since it will be
                 * poked in a moment.
                 */
                bitset_atomic_del(&cpu_part->cp_haltset, cpu_sid);

                /*
                 * We may find the current CPU present in the halted cpuset
                 * if we're in the context of an interrupt that occurred
                 * before we had a chance to clear our bit in cpu_idle().
                 * Waking ourself is obviously unnecessary, since if
                 * we're here, we're not halted.
                 */
                if (cp != CPU) {
                        /*
                         * Use correct wakeup mechanism
                         */
                        if ((mcpu_mwait != NULL) &&
                            (*mcpu_mwait == MWAIT_HALTED))
                                MWAIT_WAKEUP(cp);
                        else
                                poke_cpu(cp->cpu_id);
                }
                return;
        } else {
                /*
                 * This cpu isn't halted, but it's idle or undergoing a
                 * context switch. No need to awaken anyone else.
                 */
                if (cp->cpu_thread == cp->cpu_idle_thread ||
                    cp->cpu_disp_flags & CPU_DISP_DONTSTEAL)
                        return;
        }

        /*
         * No need to wake up other CPUs if the thread we just enqueued
         * is bound.
         */
        if (bound)
                return;

        /*
         * See if there are any other halted CPUs. If there are, then
         * select one and awaken it.
         * It's possible that after we find a CPU, somebody else
         * will awaken it before we get the chance.
         * In that case, look again.
         */
        do {
                cpu_found = bitset_find(&cpu_part->cp_haltset);
                if (cpu_found == (uint_t)-1)
                        return;

        } while (bitset_atomic_test_and_del(&cpu_part->cp_haltset,
            cpu_found) < 0);

        /*
         * Must use correct wakeup mechanism to avoid lost wakeup of
         * alternate cpu.
         */
        if (cpu_found != CPU->cpu_seqid) {
                mcpu_mwait = cpu_seq[cpu_found]->cpu_m.mcpu_mwait;
                if ((mcpu_mwait != NULL) && (*mcpu_mwait == MWAIT_HALTED))
                        MWAIT_WAKEUP(cpu_seq[cpu_found]);
                else
                        poke_cpu(cpu_seq[cpu_found]->cpu_id);
        }
}

/*
 * Function called by the CPU idle notification framework to check whether a
 * CPU has been awakened. It will be called with interrupts disabled.
 * If the CPU has been awakened, call cpu_idle_exit() to notify the CPU idle
 * notification framework.
 */
static void
acpi_cpu_mwait_check_wakeup(void *arg)
{
        volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

        ASSERT(arg != NULL);
        if (*mcpu_mwait != MWAIT_HALTED) {
                /*
                 * CPU has been awakened, notify CPU idle notification system.
                 */
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        } else {
                /*
                 * Toggle interrupt flag to detect pending interrupts.
                 * If interrupt happened, do_interrupt() will notify CPU idle
                 * notification framework so no need to call cpu_idle_exit()
                 * here.
                 */
                sti();
                SMT_PAUSE();
                cli();
        }
}

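/*
 * As above, but for CPUs idled from a state where mcpu_mwait was set to
 * MWAIT_WAKEUP_IPI (the ACPI System I/O case), so wakeups arrive as IPIs
 * rather than as stores that satisfy the monitor.
 */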
static void
acpi_cpu_mwait_ipi_check_wakeup(void *arg)
{
        volatile uint32_t *mcpu_mwait = (volatile uint32_t *)arg;

        ASSERT(arg != NULL);
        if (*mcpu_mwait != MWAIT_WAKEUP_IPI) {
                /*
                 * CPU has been awakened, notify CPU idle notification system.
                 */
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        } else {
                /*
                 * Toggle interrupt flag to detect pending interrupts.
                 * If interrupt happened, do_interrupt() will notify CPU idle
                 * notification framework so no need to call cpu_idle_exit()
                 * here.
                 */
                sti();
                SMT_PAUSE();
                cli();
        }
}

/*ARGSUSED*/
static void
acpi_cpu_check_wakeup(void *arg)
{
        /*
         * Toggle interrupt flag to detect pending interrupts.
         * If interrupt happened, do_interrupt() will notify CPU idle
         * notification framework so no need to call cpu_idle_exit() here.
         */
        sti();
        SMT_PAUSE();
        cli();
}

/*
 * Idle the current CPU via ACPI-defined System I/O read to an ACPI-specified
 * address.
 */
static void
acpi_io_idle(uint32_t address)
{
        uint32_t value;
        ACPI_TABLE_FADT *gbl_FADT;

        /*
         * Do we need to work around an ancient chipset bug in early ACPI
         * implementations that would result in a late STPCLK# assertion?
         *
         * Must be true when running on systems where the ACPI-indicated I/O
         * read to enter low-power states may resolve before actually stopping
         * the processor that initiated a low-power transition. On such systems,
         * it is possible the processor would proceed past the idle point and
         * *then* be stopped.
         *
         * An early workaround that has been carried forward is to read the ACPI
         * PM Timer after requesting a low-power transition. The timer read will
         * take long enough that we are certain the processor is safe to be
         * stopped.
         *
         * From some investigation, this was only ever necessary on older Intel
         * chipsets. Additionally, the timer read can take upwards of a thousand
         * CPU clocks, so for systems that work correctly, it's just a tarpit
         * for the CPU as it is woken back up.
         */
        boolean_t need_stpclk_workaround =
            cpuid_getvendor(CPU) == X86_VENDOR_Intel;

        /*
         * The following port read will cause us to halt, which will cause
         * the store buffer to be repartitioned, potentially exposing us to
         * the Intel CPU vulnerability MDS. As such, we need to explicitly
         * call x86_md_clear() here. The other idle methods do this
         * automatically as part of the implementation of i86_mwait().
         */
        x86_md_clear();
        (void) cpu_acpi_read_port(address, &value, 8);
        if (need_stpclk_workaround) {
                acpica_get_global_FADT(&gbl_FADT);
                (void) cpu_acpi_read_port(
                    gbl_FADT->XPmTimerBlock.Address,
                    &value, 32);
        }
}

/*
 * Enter a deep c-state on the current CPU.
 */
static void
acpi_cpu_cstate(cpu_acpi_cstate_t *cstate)
{
        volatile uint32_t       *mcpu_mwait = CPU->cpu_m.mcpu_mwait;
        uint32_t                mwait_idle_state;
        cpu_t                   *cpup = CPU;
        processorid_t           cpu_sid = cpup->cpu_seqid;
        cpupart_t               *cp = cpup->cpu_part;
        hrtime_t                lapic_expire;
        uint8_t                 type = cstate->cs_addrspace_id;
        uint32_t                cs_type = cstate->cs_type;
        int                     hset_update = 1;
        boolean_t               using_timer;
        cpu_idle_check_wakeup_t check_func = &acpi_cpu_check_wakeup;

        /*
         * Set our mcpu_mwait here, so we can tell if anyone tries to
         * wake us between now and when we call mwait.  No other cpu will
         * attempt to set our mcpu_mwait until we add ourselves to the
         * haltset.
         */
        if (mcpu_mwait != NULL) {
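                /*
                 * A CPU idled via ACPI System I/O sleeps in a port read,
                 * not in mwait, so a store to mcpu_mwait cannot wake it.
                 * Advertising MWAIT_WAKEUP_IPI here makes cstate_wakeup()
                 * send an IPI instead.
                 */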
                if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
                        mwait_idle_state = MWAIT_WAKEUP_IPI;
                        check_func = &acpi_cpu_mwait_ipi_check_wakeup;
                } else {
                        mwait_idle_state = MWAIT_HALTED;
                        check_func = &acpi_cpu_mwait_check_wakeup;
                }
                *mcpu_mwait = mwait_idle_state;
        } else {
                /*
                 * Initialize mwait_idle_state, but with mcpu_mwait NULL we'll
                 * never actually use it here. "MWAIT_RUNNING" just
                 * distinguishes from the "WAKEUP_IPI" and "HALTED" cases above.
                 */
                mwait_idle_state = MWAIT_RUNNING;
        }

        /*
         * If this CPU is online, and there are multiple CPUs
         * in the system, then we should note our halting
         * by adding ourselves to the partition's halted CPU
         * bitmap. This allows other CPUs to find/awaken us when
         * work becomes available.
         */
        if (cpup->cpu_flags & CPU_OFFLINE || ncpus == 1)
                hset_update = 0;

        /*
         * Add ourselves to the partition's halted CPUs bitmask
         * and set our HALTED flag, if necessary.
         *
         * When a thread becomes runnable, it is placed on the queue
         * and then the halted cpuset is checked to determine who
         * (if anyone) should be awakened. We therefore need to first
         * add ourselves to the halted cpuset, and then check if there
         * is any work available.
         *
         * Note that memory barriers after updating the HALTED flag
         * are not necessary since an atomic operation (updating the bitmap)
         * immediately follows. On x86 the atomic operation acts as a
         * memory barrier for the update of cpu_disp_flags.
         */
        if (hset_update) {
                cpup->cpu_disp_flags |= CPU_DISP_HALTED;
                bitset_atomic_add(&cp->cp_haltset, cpu_sid);
        }

        /*
         * Check to make sure there's really nothing to do.  Work destined for
         * this CPU may become available after this check. If we're
         * mwait-halting we'll be notified through the clearing of our bit in
         * the halted CPU bitmask, and a write to our mcpu_mwait.  Otherwise,
         * we're hlt-based halting, and we'll be immediately woken by the
         * pending interrupt.
         *
         * disp_anywork() checks disp_nrunnable, so we do not have to later.
         */
        if (disp_anywork()) {
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        /*
         * We're on our way to being halted.
         *
         * The local APIC timer can stop in ACPI C2 and deeper c-states.
         * Try to program the HPET hardware to substitute for this CPU's
         * LAPIC timer.
         * cstate_use_timer() could disable the LAPIC Timer.  Make sure
         * to start the LAPIC Timer again before leaving this function.
         *
         * Disable interrupts here so we will awaken immediately after halting
         * if someone tries to poke us between now and the time we actually
         * halt.
         */
        cli();
        using_timer = cstate_use_timer(&lapic_expire, CSTATE_USING_HPET);

        /*
         * We check for the presence of our bit after disabling interrupts.
         * If it's cleared, we'll return. If the bit is cleared after
         * we check then cstate_wakeup() will pop us out of the halted
         * state.
         *
         * This means that the ordering of the clearing of the bit and the
         * wakeup is important.
         * cstate_wakeup() must clear our bit in cp_haltset, and only then
         * deliver the wakeup (mwait store or IPI).
         * acpi_cpu_cstate() must disable interrupts, then check for the bit.
         */
        if (hset_update && bitset_in_set(&cp->cp_haltset, cpu_sid) == 0) {
                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                return;
        }

        /*
         * The check for anything locally runnable is here for performance
         * and isn't needed for correctness. disp_nrunnable ought to be
         * in our cache still, so it's inexpensive to check, and if there
         * is anything runnable we won't have to wait for the poke.
         */
        if (cpup->cpu_disp->disp_nrunnable != 0) {
                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        if (using_timer == B_FALSE) {
                (void) cstate_use_timer(&lapic_expire,
                    CSTATE_USING_LAT);
                sti();

                /*
                 * We are currently unable to program the HPET to act as this
                 * CPU's proxy LAPIC timer.  This CPU cannot enter C2 or deeper
                 * because no timer is set to wake it up while its LAPIC timer
                 * stalls in deep C-States.
                 * Enter C1 instead.
                 *
                 * cstate_wakeup() will wake this CPU with an IPI, which works
                 * with either MWAIT or HLT.
                 */
                if (mcpu_mwait != NULL) {
                        i86_monitor(mcpu_mwait, 0, 0);
                        if (*mcpu_mwait == MWAIT_HALTED) {
                                if (cpu_idle_enter(IDLE_STATE_C1, 0,
                                    check_func, (void *)mcpu_mwait) == 0) {
                                        if (*mcpu_mwait == MWAIT_HALTED) {
                                                i86_mwait(0, 0);
                                        }
                                        cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
                                }
                        }
                } else {
                        if (cpu_idle_enter(cs_type, 0, check_func, NULL) == 0) {
                                mach_cpu_idle();
                                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
                        }
                }

                /*
                 * We're no longer halted
                 */
                if (hset_update) {
                        cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                        bitset_atomic_del(&cp->cp_haltset, cpu_sid);
                }
                return;
        }

        /*
         * Tell the cpu idle framework we're going to try idling.
         *
         * If cpu_idle_enter returns nonzero, we've found out at the last minute
         * that we don't actually want to idle.
         */
        boolean_t idle_ok = cpu_idle_enter(cs_type, 0, check_func,
            (void *)mcpu_mwait) == 0;

        if (idle_ok) {
                if (type == ACPI_ADR_SPACE_FIXED_HARDWARE) {
                        if (mcpu_mwait != NULL) {
                                /*
                                 * We're on our way to being halted.
                                 * To avoid a lost wakeup, arm the monitor
                                 * before checking if another cpu wrote to
                                 * mcpu_mwait to wake us up.
                                 */
                                i86_monitor(mcpu_mwait, 0, 0);
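                                /*
                                 * EAX carries this c-state's mwait hint;
                                 * ECX bit 0 (break-on-interrupt) lets an
                                 * interrupt wake us from mwait even with
                                 * interrupts masked.
                                 */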
                                if (*mcpu_mwait == mwait_idle_state) {
                                        i86_mwait(cstate->cs_address, 1);
                                }
                        } else {
                                mach_cpu_idle();
                        }
                } else if (type == ACPI_ADR_SPACE_SYSTEM_IO) {
                        /*
                         * mcpu_mwait is not directly part of idling or wakeup
                         * in the ACPI System I/O case, but if available it can
                         * hint that we shouldn't actually try to idle because
                         * we're about to be woken up anyway.
                         *
                         * A trip through idle/wakeup can be upwards of a few
                         * microseconds, so avoiding that makes this a helpful
                         * optimization, but consulting mcpu_mwait is still not
                         * necessary for correctness here.
                         */
                        if (!mcpu_mwait || *mcpu_mwait == mwait_idle_state) {
                                acpi_io_idle(cstate->cs_address);
                        }
                }

                /*
                 * We've either idled and woken up, or decided not to idle.
                 * Either way, tell the cpu idle framework that we're not trying
                 * to idle anymore.
                 */
                cpu_idle_exit(CPU_IDLE_CB_FLAG_IDLE);
        }

        /*
         * The LAPIC timer may have stopped in deep c-state.
         * Reprogram this CPU's LAPIC here before enabling interrupts.
         */
        (void) cstate_use_timer(&lapic_expire, CSTATE_USING_LAT);
        sti();

        /*
         * We're no longer halted
         */
        if (hset_update) {
                cpup->cpu_disp_flags &= ~CPU_DISP_HALTED;
                bitset_atomic_del(&cp->cp_haltset, cpu_sid);
        }
}

/*
 * Idle the current CPU, using deep c-states when supported.
 */
void
cpu_acpi_idle(void)
{
        cpu_t *cp = CPU;
        cpu_acpi_handle_t handle;
        cma_c_state_t *cs_data;
        cpu_acpi_cstate_t *cstates;
        hrtime_t start, end;
        int cpu_max_cstates;
        uint32_t cs_indx;
        uint16_t cs_type;

        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        handle = mach_state->ms_acpi_handle;
        ASSERT(CPU_ACPI_CSTATES(handle) != NULL);

        cs_data = mach_state->ms_cstate.cma_state.cstate;
        cstates = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        ASSERT(cstates != NULL);
        cpu_max_cstates = cpu_acpi_get_max_cstates(handle);
        if (cpu_max_cstates > CPU_MAX_CSTATES)
                cpu_max_cstates = CPU_MAX_CSTATES;
        if (cpu_max_cstates == 1) {     /* no ACPI c-state data */
                (*non_deep_idle_cpu)();
                return;
        }

        start = gethrtime_unscaled();

        cs_indx = cpupm_next_cstate(cs_data, cstates, cpu_max_cstates, start);

        cs_type = cstates[cs_indx].cs_type;

        switch (cs_type) {
        default:
                /* FALLTHROUGH */
        case CPU_ACPI_C1:
                (*non_deep_idle_cpu)();
                break;

        case CPU_ACPI_C2:
                acpi_cpu_cstate(&cstates[cs_indx]);
                break;

        case CPU_ACPI_C3:
                /*
                 * All supported Intel processors maintain cache coherency
                 * during C3.  Currently when entering C3 processors flush
                 * core caches to higher level shared cache. The shared cache
                 * maintains state and supports probes during C3.
                 * Consequently there is no need to handle cache coherency
                 * and Bus Master activity here with the cache flush, BM_RLD
                 * bit, BM_STS bit, nor PM2_CNT.ARB_DIS mechanisms described
                 * in section 8.1.4 of the ACPI Specification 4.0.
                 */
                acpi_cpu_cstate(&cstates[cs_indx]);
                break;
        }

        end = gethrtime_unscaled();

        /*
         * Update statistics
         */
        cpupm_wakeup_cstate_data(cs_data, end);
}

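/*
 * Determine whether deep c-states may be used on this system, recording
 * along the way which always-available timer (ARAT, or the HPET as a LAPIC
 * proxy) will keep an idle CPU wakeable while its LAPIC timer is stopped.
 */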
boolean_t
cpu_deep_cstates_supported(void)
{
        extern int      idle_cpu_no_deep_c;

        if (idle_cpu_no_deep_c)
                return (B_FALSE);

        if (!cpuid_deep_cstates_supported())
                return (B_FALSE);

        if (cpuid_arat_supported()) {
                cpu_cstate_arat = B_TRUE;
                return (B_TRUE);
        }

        /*
         * In theory we can use the HPET as a proxy timer in case we can't rely
         * on the LAPIC in deep C-states. In practice on AMD it seems something
         * isn't quite right and we just don't get woken up, so the proxy timer
         * approach doesn't work. Only set up the HPET as proxy timer on Intel
         * systems for now.
         */
        if (cpuid_getvendor(CPU) == X86_VENDOR_Intel &&
            (hpet.supported == HPET_FULL_SUPPORT) &&
            hpet.install_proxy()) {
                cpu_cstate_hpet = B_TRUE;
                return (B_TRUE);
        }

        return (B_FALSE);
}

/*
 * Validate that this processor supports deep c-states and, if so,
 * get the c-state data from ACPI and cache it.
 */
static int
cpu_idle_init(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        char name[KSTAT_STRLEN];
        int cpu_max_cstates, i;
        int ret;

        /*
         * Cache the C-state specific ACPI data.
         */
        if ((ret = cpu_acpi_cache_cstate_data(handle)) != 0) {
                if (ret < 0)
                        cmn_err(CE_NOTE,
                            "!Support for CPU deep idle states is being "
                            "disabled due to errors parsing ACPI C-state "
                            "objects exported by BIOS.");
                cpu_idle_fini(cp);
                return (-1);
        }

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);

        cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

        for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                (void) snprintf(name, KSTAT_STRLEN - 1, "c%d", cstate->cs_type);
                /*
                 * Allocate, initialize and install cstate kstat
                 */
                cstate->cs_ksp = kstat_create("cstate", cp->cpu_id,
                    name, "misc",
                    KSTAT_TYPE_NAMED,
                    sizeof (cpu_idle_kstat) / sizeof (kstat_named_t),
                    KSTAT_FLAG_VIRTUAL);

                if (cstate->cs_ksp == NULL) {
                        cmn_err(CE_NOTE, "kstat_create(c_state) fail");
                } else {
                        cstate->cs_ksp->ks_data = &cpu_idle_kstat;
                        cstate->cs_ksp->ks_lock = &cpu_idle_mutex;
                        cstate->cs_ksp->ks_update = cpu_idle_kstat_update;
                        cstate->cs_ksp->ks_data_size += MAXNAMELEN;
                        cstate->cs_ksp->ks_private = cstate;
                        kstat_install(cstate->cs_ksp);
                }
                cstate++;
        }

        cpupm_alloc_domains(cp, CPUPM_C_STATES);
        cpupm_alloc_ms_cstate(cp);

        if (cpu_deep_cstates_supported()) {
                uint32_t value;

                mutex_enter(&cpu_idle_callb_mutex);
                if (cpu_deep_idle_callb_id == (callb_id_t)0)
                        cpu_deep_idle_callb_id = callb_add(&cpu_deep_idle_callb,
                            (void *)NULL, CB_CL_CPU_DEEP_IDLE, "cpu_deep_idle");
                if (cpu_idle_cpr_callb_id == (callb_id_t)0)
                        cpu_idle_cpr_callb_id = callb_add(&cpu_idle_cpr_callb,
                            (void *)NULL, CB_CL_CPR_PM, "cpu_idle_cpr");
                mutex_exit(&cpu_idle_callb_mutex);


                /*
                 * All supported CPUs (Nehalem and later) will remain in C3
                 * during Bus Master activity.
                 * Clear ACPI_BITREG_BUS_MASTER_RLD here if it is not
                 * already 0 before enabling deeper c-states.
                 */
                cpu_acpi_get_register(ACPI_BITREG_BUS_MASTER_RLD, &value);
                if (value & 1)
                        cpu_acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
        }

        return (0);
}

/*
 * Free resources allocated by cpu_idle_init().
 */
static void
cpu_idle_fini(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        uint_t  cpu_max_cstates, i;

        /*
         * Point the idle function back at the generic one.
         */
        idle_cpu = cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
        disp_enq_thread = non_deep_idle_disp_enq_thread;

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        if (cstate) {
                cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

                for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                        if (cstate->cs_ksp != NULL)
                                kstat_delete(cstate->cs_ksp);
                        cstate++;
                }
        }

        cpupm_free_ms_cstate(cp);
        cpupm_free_domains(&cpupm_cstate_domains);
        cpu_acpi_free_cstate_data(handle);

        mutex_enter(&cpu_idle_callb_mutex);
        if (cpu_deep_idle_callb_id != (callb_id_t)0) {
                (void) callb_delete(cpu_deep_idle_callb_id);
                cpu_deep_idle_callb_id = (callb_id_t)0;
        }
        if (cpu_idle_cpr_callb_id != (callb_id_t)0) {
                (void) callb_delete(cpu_idle_cpr_callb_id);
                cpu_idle_cpr_callb_id = (callb_id_t)0;
        }
        mutex_exit(&cpu_idle_callb_mutex);
}

/*
 * This function is introduced to solve a race condition between the
 * master and the slave touching the c-state data structures. After the
 * slave calls this idle function and switches to the non-deep idle
 * function, the master can go on to reclaim the resources.
 */
static void
cpu_idle_stop_sync(void)
{
        /* switch to the non deep idle function */
        CPU->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
}

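/*
 * The cpu_idle_ops stop entry point: force the target CPU off the deep
 * idle function, then free its c-state kstats, domain, and cached ACPI
 * data.
 */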
static void
cpu_idle_stop(cpu_t *cp)
{
        cpupm_mach_state_t *mach_state =
            (cpupm_mach_state_t *)(cp->cpu_m.mcpu_pm_mach_state);
        cpu_acpi_handle_t handle = mach_state->ms_acpi_handle;
        cpu_acpi_cstate_t *cstate;
        uint_t cpu_max_cstates, i = 0;

        mutex_enter(&cpu_idle_callb_mutex);
        if (idle_cpu == cpu_idle_adaptive) {
                /*
                 * Make the slave call the synchronous idle function.
                 */
                cp->cpu_m.mcpu_idle_cpu = cpu_idle_stop_sync;
                poke_cpu(cp->cpu_id);

                /*
                 * Wait until the slave switches to the non-deep idle
                 * function, so that the master is safe to go on and
                 * reclaim the resources.
                 */
                while (cp->cpu_m.mcpu_idle_cpu != non_deep_idle_cpu) {
                        drv_usecwait(10);
                        if ((++i % CPU_IDLE_STOP_TIMEOUT) == 0)
                                cmn_err(CE_NOTE, "!cpu_idle_stop: the slave"
                                    " idle stop timeout");
                }
        }
        mutex_exit(&cpu_idle_callb_mutex);

        cstate = (cpu_acpi_cstate_t *)CPU_ACPI_CSTATES(handle);
        if (cstate) {
                cpu_max_cstates = cpu_acpi_get_max_cstates(handle);

                for (i = CPU_ACPI_C1; i <= cpu_max_cstates; i++) {
                        if (cstate->cs_ksp != NULL)
                                kstat_delete(cstate->cs_ksp);
                        cstate++;
                }
        }
        cpupm_free_ms_cstate(cp);
        cpupm_remove_domains(cp, CPUPM_C_STATES, &cpupm_cstate_domains);
        cpu_acpi_free_cstate_data(handle);
}

/*ARGSUSED*/
static boolean_t
cpu_deep_idle_callb(void *arg, int code)
{
        boolean_t rslt = B_TRUE;

        mutex_enter(&cpu_idle_callb_mutex);
        switch (code) {
        case PM_DEFAULT_CPU_DEEP_IDLE:
                /*
                 * Default policy is same as enable
                 */
                /*FALLTHROUGH*/
        case PM_ENABLE_CPU_DEEP_IDLE:
                if ((cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG) == 0)
                        break;

                if (cstate_timer_callback(PM_ENABLE_CPU_DEEP_IDLE)) {
                        disp_enq_thread = cstate_wakeup;
                        idle_cpu = cpu_idle_adaptive;
                        cpu_idle_cfg_state &= ~CPU_IDLE_DEEP_CFG;
                } else {
                        rslt = B_FALSE;
                }
                break;

        case PM_DISABLE_CPU_DEEP_IDLE:
                if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
                        break;

                idle_cpu = non_deep_idle_cpu;
                if (cstate_timer_callback(PM_DISABLE_CPU_DEEP_IDLE)) {
                        disp_enq_thread = non_deep_idle_disp_enq_thread;
                        cpu_idle_cfg_state |= CPU_IDLE_DEEP_CFG;
                }
                break;

        default:
                cmn_err(CE_NOTE, "!cpu deep_idle_callb: invalid code %d\n",
                    code);
                break;
        }
        mutex_exit(&cpu_idle_callb_mutex);
        return (rslt);
}

/*ARGSUSED*/
static boolean_t
cpu_idle_cpr_callb(void *arg, int code)
{
        boolean_t rslt = B_TRUE;

        mutex_enter(&cpu_idle_callb_mutex);
        switch (code) {
        case CB_CODE_CPR_RESUME:
                if (cstate_timer_callback(CB_CODE_CPR_RESUME)) {
                        /*
                         * Do not enable dispatcher hooks if disabled by user.
                         */
                        if (cpu_idle_cfg_state & CPU_IDLE_DEEP_CFG)
                                break;

                        disp_enq_thread = cstate_wakeup;
                        idle_cpu = cpu_idle_adaptive;
                } else {
                        rslt = B_FALSE;
                }
                break;

        case CB_CODE_CPR_CHKPT:
                idle_cpu = non_deep_idle_cpu;
                disp_enq_thread = non_deep_idle_disp_enq_thread;
                (void) cstate_timer_callback(CB_CODE_CPR_CHKPT);
                break;

        default:
                cmn_err(CE_NOTE, "!cpudvr cpr_callb: invalid code %d\n", code);
                break;
        }
        mutex_exit(&cpu_idle_callb_mutex);
        return (rslt);
}

/*
 * handle _CST notification
 */
void
cpuidle_cstate_instance(cpu_t *cp)
{
#ifndef __xpv
        cpupm_mach_state_t      *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        cpu_acpi_handle_t       handle;
        struct machcpu          *mcpu;
        cpuset_t                dom_cpu_set;
        kmutex_t                *pm_lock;
        int                     result = 0;
        processorid_t           cpu_id;

        if (mach_state == NULL) {
                return;
        }

        ASSERT(mach_state->ms_cstate.cma_domain != NULL);
        dom_cpu_set = mach_state->ms_cstate.cma_domain->pm_cpus;
        pm_lock = &mach_state->ms_cstate.cma_domain->pm_lock;

        /*
         * Do this for all the CPUs in the domain.
         */
        mutex_enter(pm_lock);
        do {
                CPUSET_FIND(dom_cpu_set, cpu_id);
                if (cpu_id == CPUSET_NOTINSET)
                        break;

                ASSERT(cpu_id >= 0 && cpu_id < NCPU);
                cp = cpu[cpu_id];
                mach_state = (cpupm_mach_state_t *)
                    cp->cpu_m.mcpu_pm_mach_state;
                if (!(mach_state->ms_caps & CPUPM_C_STATES)) {
                        mutex_exit(pm_lock);
                        return;
                }
                handle = mach_state->ms_acpi_handle;
                ASSERT(handle != NULL);

                /*
                 * re-evaluate cstate object
                 */
                if (cpu_acpi_cache_cstate_data(handle) != 0) {
                        cmn_err(CE_WARN, "Cannot re-evaluate the cpu c-state"
                            " object Instance: %d", cpu_id);
                }
                mcpu = &(cp->cpu_m);
                mcpu->max_cstates = cpu_acpi_get_max_cstates(handle);
                if (mcpu->max_cstates > CPU_ACPI_C1) {
                        (void) cstate_timer_callback(
                            CST_EVENT_MULTIPLE_CSTATES);
                        disp_enq_thread = cstate_wakeup;
                        cp->cpu_m.mcpu_idle_cpu = cpu_acpi_idle;
                } else if (mcpu->max_cstates == CPU_ACPI_C1) {
                        disp_enq_thread = non_deep_idle_disp_enq_thread;
                        cp->cpu_m.mcpu_idle_cpu = non_deep_idle_cpu;
                        (void) cstate_timer_callback(CST_EVENT_ONE_CSTATE);
                }

                CPUSET_ATOMIC_XDEL(dom_cpu_set, cpu_id, result);
        } while (result < 0);
        mutex_exit(pm_lock);
#endif
}

/*
 * Handle a change in the number or type of the available processor power
 * states.
 */
void
cpuidle_manage_cstates(void *ctx)
{
        cpu_t                   *cp = ctx;
        cpupm_mach_state_t      *mach_state =
            (cpupm_mach_state_t *)cp->cpu_m.mcpu_pm_mach_state;
        boolean_t               is_ready;

        if (mach_state == NULL) {
                return;
        }

        /*
         * We currently refuse to power manage if the CPU is not ready to
         * take cross calls (cross calls fail silently if the CPU is not
         * ready for them).
         *
         * Additionally, for x86 platforms we cannot power manage an
         * instance until it has been initialized.
         */
        is_ready = (cp->cpu_flags & CPU_READY) && cpupm_cstate_ready(cp);
        if (!is_ready)
                return;

        cpuidle_cstate_instance(cp);
}