root/usr/src/uts/i86pc/io/fipe/fipe_pm.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2009, Intel Corporation.
 * All rights reserved.
 */

#include <sys/atomic.h>
#include <sys/cpuvar.h>
#include <sys/cpu.h>
#include <sys/cpu_event.h>
#include <sys/cmn_err.h>
#include <sys/ddi.h>
#include <sys/kmem.h>
#include <sys/kstat.h>
#include <sys/pci.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/synch.h>
#include <sys/sysmacros.h>
#include <sys/fipe.h>
#include <vm/hat.h>

/*
 * Current PM policy, configurable through /etc/system and fipe.conf.
 * Defaults to the balanced policy.
 */
fipe_pm_policy_t fipe_pm_policy = FIPE_PM_POLICY_BALANCE;
/*
 * Throttle level tunable, defaults to 1.
 * NOTE(review): presumably the value handed to fipe_mc_change(); the use
 * site is outside this part of the file -- confirm.
 */
int fipe_pm_throttle_level = 1;

/*
 * Enable kstat support.
 * This macro is tested with #ifdef, so remove the definition (rather than
 * setting it to 0) to disable kstat support.
 */
#define FIPE_KSTAT_SUPPORT              1

/*
 * Enable performance relative statistics.
 * Also tested with #ifdef; remove the definition to disable.
 */
#define FIPE_KSTAT_DETAIL               1

/*
 * Enable builtin IOAT driver if no IOAT driver is available.
 * A value of 0 undefines the macro again, so that the #ifdef tests used in
 * the rest of the file select the dcopy based implementation.
 */
#define FIPE_IOAT_BUILTIN               0
#if defined(FIPE_IOAT_BUILTIN) && (FIPE_IOAT_BUILTIN == 0)
#undef  FIPE_IOAT_BUILTIN
#endif

#ifdef  FIPE_IOAT_BUILTIN
/*
 * Use IOAT channel 3 to generate memory transactions.
 * The values below are register offsets relative to the mapped IOAT
 * register window (ioat_reg_addr).
 */
#define FIPE_IOAT_CHAN_CTRL             0x200
#define FIPE_IOAT_CHAN_STS_LO           0x204
#define FIPE_IOAT_CHAN_STS_HI           0x208
#define FIPE_IOAT_CHAN_ADDR_LO          0x20C
#define FIPE_IOAT_CHAN_ADDR_HI          0x210
#define FIPE_IOAT_CHAN_CMD              0x214
#define FIPE_IOAT_CHAN_ERR              0x228
#else   /* FIPE_IOAT_BUILTIN */
#include <sys/dcopy.h>
#endif  /* FIPE_IOAT_BUILTIN */

/*
 * Memory controller relative PCI configuration constants.
 * GBLACT, THRTLOW and THRTCTRL are PCI configuration space offsets accessed
 * with pci_config_get8()/pci_config_put8(); THRTCTRL_HUNT is a bit within
 * the THRTCTRL register.
 */
#define FIPE_MC_GBLACT                  0x60
#define FIPE_MC_THRTLOW                 0x64
#define FIPE_MC_THRTCTRL                0x67
#define FIPE_MC_THRTCTRL_HUNT           0x1

/*
 * Hardware recommended values.
 * MEMORY_OFFSET is the distance between source and destination of each IOAT
 * copy, MEMORY_SIZE is the number of bytes copied by each ring command.
 */
#define FIPE_MC_MEMORY_OFFSET           1024
#define FIPE_MC_MEMORY_SIZE             128

/* Number of IOAT commands posted when entering idle. */
#define FIPE_IOAT_CMD_NUM               2

/* Resource allocation retry interval in microseconds (15 seconds). */
#define FIPE_IOAT_RETRY_INTERVAL        (15 * 1000 * 1000)

/* Statistics update interval in nanoseconds (10 milliseconds). */
#define FIPE_STAT_INTERVAL              (10 * 1000 * 1000)

/*
 * Configuration profile support.
 * Accessors for the fields of the profile fipe_profile_curr points to.
 */
#define FIPE_PROFILE_FIELD(field)       (fipe_profile_curr->field)
#define FIPE_PROF_IDLE_COUNT            FIPE_PROFILE_FIELD(idle_count)
#define FIPE_PROF_BUSY_THRESHOLD        FIPE_PROFILE_FIELD(busy_threshold)
#define FIPE_PROF_INTR_THRESHOLD        FIPE_PROFILE_FIELD(intr_threshold)
#define FIPE_PROF_INTR_BUSY_THRESHOLD   FIPE_PROFILE_FIELD(intr_busy_threshold)
#define FIPE_PROF_INTR_BUSY_THROTTLE    FIPE_PROFILE_FIELD(intr_busy_throttle)

/* Priority assigned to FIPE memory power management driver on x86. */
#define CPU_IDLE_CB_PRIO_FIPE           (CPU_IDLE_CB_PRIO_LOW_BASE + 0x4000000)

/*
 * Structure to support power management profile.
 * The table holds FIPE_PM_POLICY_MAX entries; the active entry is reached
 * through fipe_profile_curr and the FIPE_PROF_* accessor macros.
 * NOTE(review): the table is presumably indexed by fipe_pm_policy_t value,
 * with the all-zero first row being the "disabled" policy -- confirm against
 * the enum in sys/fipe.h.
 */
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_profiles)
static struct fipe_profile {
        uint32_t                        idle_count;
        uint32_t                        busy_threshold;
        uint32_t                        intr_threshold;
        uint32_t                        intr_busy_threshold;
        uint32_t                        intr_busy_throttle;
} fipe_profiles[FIPE_PM_POLICY_MAX] = {
        /* Columns follow the field order of struct fipe_profile above. */
        { 0,    0,      0,      0,      0 },
        { 5,    30,     20,     50,     5 },
        { 10,   40,     40,     75,     4 },
        { 15,   50,     60,     100,    2 },
};

/*
 * Structure to store memory controller relative data.
 * Filled in by fipe_mc_init() and cleared by fipe_mc_fini().
 */
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_mc_ctrl)
static struct fipe_mc_ctrl {
        /* PCI configuration space access handle of the memory controller. */
        ddi_acc_handle_t                mc_pci_hdl;
        /* Original THRTCTRL/THRTLOW/GBLACT values saved by fipe_mc_init(). */
        unsigned char                   mc_thrtctrl;
        unsigned char                   mc_thrtlow;
        unsigned char                   mc_gblact;
        /* Memory controller devinfo node, held while mc_initialized is set. */
        dev_info_t                      *mc_dip;
        /* B_TRUE once fipe_mc_init() has completed successfully. */
        boolean_t                       mc_initialized;
} fipe_mc_ctrl;

/*
 * Structure to store IOAT relative information.
 * The allocation state flags are updated under ioat_lock by
 * fipe_ioat_alloc() and fipe_ioat_free().
 */
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_ioat_ctrl)
static struct fipe_ioat_control {
        kmutex_t                        ioat_lock;
        /* B_TRUE once IOAT resources have been set up by fipe_ioat_alloc(). */
        boolean_t                       ioat_ready;
#ifdef  FIPE_IOAT_BUILTIN
        /* Builtin driver: mapped IOAT register window and its state. */
        boolean_t                       ioat_reg_mapped;
        ddi_acc_handle_t                ioat_reg_handle;
        uint8_t                         *ioat_reg_addr;
        /* Physical address written to the channel address registers. */
        uint64_t                        ioat_cmd_physaddr;
#else   /* FIPE_IOAT_BUILTIN */
        /*
         * dcopy variant: ioat_cmds[0] breaks the ring to stop the copy,
         * ioat_cmds[1 .. FIPE_IOAT_CMD_NUM] form the ring itself.
         */
        dcopy_cmd_t                     ioat_cmds[FIPE_IOAT_CMD_NUM + 1];
        dcopy_handle_t                  ioat_handle;
#endif  /* FIPE_IOAT_BUILTIN */
        /* IOAT devinfo node found (and held) by fipe_search_ioat_dev(). */
        dev_info_t                      *ioat_dev_info;
        /* Physical address used as source of the IOAT copy commands. */
        uint64_t                        ioat_buf_physaddr;
        /*
         * NOTE(review): the three buffer fields below are set up outside
         * this part of the file; presumably start/size describe the raw
         * allocation and virtaddr the aligned buffer -- confirm.
         */
        char                            *ioat_buf_virtaddr;
        char                            *ioat_buf_start;
        size_t                          ioat_buf_size;
        /* Pending retry timeout of fipe_ioat_alloc(), 0 if none. */
        timeout_id_t                    ioat_timerid;
        /* Permanent error encountered, no further allocation attempts. */
        boolean_t                       ioat_failed;
        /* Set by fipe_ioat_free() while it cancels the retry timeout. */
        boolean_t                       ioat_cancel;
        /* B_FALSE until fipe_ioat_alloc() has been called for the first time. */
        boolean_t                       ioat_try_alloc;
} fipe_ioat_ctrl;

/*
 * Structure to store CPU idle notification relative data: the handle of the
 * registered idle callback and handles of the idle properties.
 * NOTE(review): the handles are set up outside this part of the file; the
 * prop_* names presumably mirror the queried cpu idle properties -- confirm.
 */
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_idle_ctrl)
static struct fipe_idle_ctrl {
        boolean_t                       idle_ready;
        cpu_idle_callback_handle_t      cb_handle;
        cpu_idle_prop_handle_t          prop_enter;
        cpu_idle_prop_handle_t          prop_exit;
        cpu_idle_prop_handle_t          prop_busy;
        cpu_idle_prop_handle_t          prop_idle;
        cpu_idle_prop_handle_t          prop_intr;

        /* Put here for cache efficiency, it should be in fipe_global_ctrl. */
        hrtime_t                        tick_interval;
} fipe_idle_ctrl;

/*
 * Global control structure.
 * Solaris idle thread has no reentrance issue, so it's enough to count CPUs
 * in idle state. Otherwise cpuset_t bitmap should be used to track idle CPUs.
 *
 * NOTE(review): the fields are used by code outside this part of the file;
 * cpu_count is presumably the idle CPU counter mentioned above and
 * state_buf/state_size the backing store of fipe_cpu_states -- confirm.
 */
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_gbl_ctrl)
static struct fipe_global_ctrl {
        kmutex_t                        lock;
        boolean_t                       pm_enabled;
        volatile boolean_t              pm_active;
        volatile uint32_t               cpu_count;
        volatile uint64_t               io_waiters;
        hrtime_t                        enter_ts;
        hrtime_t                        time_in_pm;
        size_t                          state_size;
        char                            *state_buf;
#ifdef  FIPE_KSTAT_SUPPORT
        /* Only present when kstat support is compiled in. */
        kstat_t                         *fipe_kstat;
#endif  /* FIPE_KSTAT_SUPPORT */
} fipe_gbl_ctrl;

/*
 * Number of padding bytes needed to bring fipe_cpu_state_t up to 128 bytes.
 * The subtracted terms must mirror the non-padding members of the structure
 * below (two boolean_t, two uint32_t, four hrtime_t, two uint64_t); keep
 * them in sync when adding or removing members.
 */
#define FIPE_CPU_STATE_PAD              (128 - \
        2 * sizeof (boolean_t) -  4 * sizeof (hrtime_t) - \
        2 * sizeof (uint64_t) - 2 * sizeof (uint32_t))

/*
 * Per-CPU status.
 * The structure is packed so that no implicit padding is inserted and the
 * explicit pad1 member alone determines the total size of 128 bytes.
 */
#pragma pack(1)
typedef struct fipe_cpu_state {
        boolean_t                       cond_ready;
        boolean_t                       state_ready;
        uint32_t                        idle_count;
        uint32_t                        throttle_cnt;
        hrtime_t                        throttle_ts;
        hrtime_t                        next_ts;
        hrtime_t                        last_busy;
        hrtime_t                        last_idle;
        uint64_t                        last_intr;
        uint64_t                        last_iowait;
        /* Explicit padding, see FIPE_CPU_STATE_PAD. */
        char                            pad1[FIPE_CPU_STATE_PAD];
} fipe_cpu_state_t;
#pragma pack()

#ifdef  FIPE_KSTAT_SUPPORT
/*
 * Named kstat counters exported by the driver.
 * The first three entries are always present, the remaining ones only when
 * FIPE_KSTAT_DETAIL is defined. The initializer list below must stay in the
 * same order as the structure members.
 */
#pragma align CPU_CACHE_COHERENCE_SIZE(fipe_kstat)
static struct fipe_kstat_s {
        kstat_named_t           fipe_enabled;
        kstat_named_t           fipe_policy;
        kstat_named_t           fipe_pm_time;
#ifdef  FIPE_KSTAT_DETAIL
        kstat_named_t           ioat_ready;
        kstat_named_t           pm_tryenter_cnt;
        kstat_named_t           pm_success_cnt;
        kstat_named_t           pm_race_cnt;
        kstat_named_t           cpu_loop_cnt;
        kstat_named_t           cpu_busy_cnt;
        kstat_named_t           cpu_idle_cnt;
        kstat_named_t           cpu_intr_busy_cnt;
        kstat_named_t           cpu_intr_throttle_cnt;
        kstat_named_t           bio_busy_cnt;
        kstat_named_t           ioat_start_fail_cnt;
        kstat_named_t           ioat_stop_fail_cnt;
#endif  /* FIPE_KSTAT_DETAIL */
} fipe_kstat = {
        { "fipe_enabled",       KSTAT_DATA_INT32 },
        { "fipe_policy",        KSTAT_DATA_INT32 },
        { "fipe_pm_time",       KSTAT_DATA_UINT64 },
#ifdef  FIPE_KSTAT_DETAIL
        { "ioat_ready",         KSTAT_DATA_INT32 },
        { "pm_tryenter_cnt",    KSTAT_DATA_UINT64 },
        { "pm_success_cnt",     KSTAT_DATA_UINT64 },
        { "pm_race_cnt",        KSTAT_DATA_UINT64 },
        { "cpu_loop_cnt",       KSTAT_DATA_UINT64 },
        { "cpu_busy_cnt",       KSTAT_DATA_UINT64 },
        { "cpu_idle_cnt",       KSTAT_DATA_UINT64 },
        { "cpu_intr_busy_cnt",  KSTAT_DATA_UINT64 },
        /* Exported name is abbreviated compared to the member name. */
        { "cpu_intr_thrt_cnt",  KSTAT_DATA_UINT64 },
        { "bio_busy_cnt",       KSTAT_DATA_UINT64 },
        { "ioat_start_fail_cnt", KSTAT_DATA_UINT64 },
        { "ioat_stop_fail_cnt", KSTAT_DATA_UINT64 }
#endif  /* FIPE_KSTAT_DETAIL */
};

/*
 * Atomically increment the 64-bit value of counter `v'.
 * FIPE_KSTAT_DETAIL_INC must be used for counters that only exist when
 * FIPE_KSTAT_DETAIL is defined; it expands to nothing otherwise.
 */
#define FIPE_KSTAT_INC(v)               \
        atomic_inc_64(&fipe_kstat.v.value.ui64)
#ifdef  FIPE_KSTAT_DETAIL
#define FIPE_KSTAT_DETAIL_INC(v)        \
        atomic_inc_64(&fipe_kstat.v.value.ui64)
#else   /* FIPE_KSTAT_DETAIL */
#define FIPE_KSTAT_DETAIL_INC(v)
#endif  /* FIPE_KSTAT_DETAIL */

#else   /* FIPE_KSTAT_SUPPORT */

/* Without kstat support both counter macros expand to nothing. */
#define FIPE_KSTAT_INC(v)
#define FIPE_KSTAT_DETAIL_INC(v)

#endif  /* FIPE_KSTAT_SUPPORT */

/* Save current power management profile during suspend/resume. */
static fipe_pm_policy_t fipe_pm_policy_saved = FIPE_PM_POLICY_BALANCE;
/* Array of per-CPU state, NULL until allocated. */
static fipe_cpu_state_t *fipe_cpu_states = NULL;

/*
 * There is no lock protecting fipe_profile_curr, so it may change while
 * threads are executing fipe_idle_enter.  This is not an issue: the pointer
 * always refers to a valid profile, so although a thread may make a
 * decision based on the previous profile, that decision is still valid for
 * some profile, and the new profile takes effect on the next cpu_idle_enter
 * cycle.  For that reason the overhead of a lock is not justified.
 */
static struct fipe_profile *fipe_profile_curr = NULL;

/* CPU idle notification callbacks, defined later in this file. */
static void fipe_idle_enter(void *arg, cpu_idle_callback_context_t ctx,
    cpu_idle_check_wakeup_t check_func, void* check_arg);
static void fipe_idle_exit(void* arg, cpu_idle_callback_context_t ctx,
    int flags);
/* Callback table: version, idle enter handler, idle exit handler. */
static cpu_idle_callback_t fipe_idle_cb = {
        CPU_IDLE_CALLBACK_VER0,
        fipe_idle_enter,
        fipe_idle_exit,
};

/*
 * Configure memory controller into power saving mode:
 * 1) OLTT activation limit is set to unlimited
 * 2) MC works in S-CLTT mode
 */
static int
fipe_mc_change(int throttle)
{
        /* Enable OLTT/disable S-CLTT mode */
        pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
            fipe_mc_ctrl.mc_thrtctrl & ~FIPE_MC_THRTCTRL_HUNT);
        /* Set OLTT activation limit to unlimited */
        pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_GBLACT, 0);
        /*
         * Set S-CLTT low throttling to desired value. The lower value,
         * the more power saving and the less available memory bandwidth.
         */
        pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTLOW, throttle);
        /* Enable S-CLTT/disable OLTT mode */
        pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
            fipe_mc_ctrl.mc_thrtctrl | FIPE_MC_THRTCTRL_HUNT);

        return (0);
}

/*
 * Restore memory controller's original configuration.
 */
static void
fipe_mc_restore(void)
{
        pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
            fipe_mc_ctrl.mc_thrtctrl & ~FIPE_MC_THRTCTRL_HUNT);
        pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_GBLACT,
            fipe_mc_ctrl.mc_gblact);
        pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTLOW,
            fipe_mc_ctrl.mc_thrtlow);
        pci_config_put8(fipe_mc_ctrl.mc_pci_hdl, FIPE_MC_THRTCTRL,
            fipe_mc_ctrl.mc_thrtctrl);
}

/*
 * Set up fipe_mc_ctrl for the memory controller node `dip'.
 *
 * A reference is taken on `dip' (dropped again in fipe_mc_fini, or right
 * here on failure), a PCI configuration access handle is created and the
 * current THRTCTRL/THRTLOW/GBLACT register values are saved so that they
 * can be restored later.
 *
 * Returns 0 on success, -1 if the configuration handle can't be set up.
 */
static int
fipe_mc_init(dev_info_t *dip)
{
        ddi_acc_handle_t cfg;
        int rc;

        bzero(&fipe_mc_ctrl, sizeof (fipe_mc_ctrl));

        /* Keep the node around for as long as we talk to it. */
        ndi_hold_devi(dip);

        rc = pci_config_setup(dip, &cfg);
        if (rc == DDI_SUCCESS) {
                /* Remember the original register contents. */
                fipe_mc_ctrl.mc_thrtctrl =
                    pci_config_get8(cfg, FIPE_MC_THRTCTRL);
                fipe_mc_ctrl.mc_thrtlow =
                    pci_config_get8(cfg, FIPE_MC_THRTLOW);
                fipe_mc_ctrl.mc_gblact =
                    pci_config_get8(cfg, FIPE_MC_GBLACT);
                fipe_mc_ctrl.mc_dip = dip;
                fipe_mc_ctrl.mc_pci_hdl = cfg;
                fipe_mc_ctrl.mc_initialized = B_TRUE;

                return (0);
        }

        /* No configuration access, undo the hold and report failure. */
        cmn_err(CE_WARN,
            "!fipe: failed to setup pcicfg handler in mc_init.");
        ndi_rele_devi(dip);

        return (-1);
}

/*
 * Restore memory controller's configuration and release resources.
 */
static void
fipe_mc_fini(void)
{
        if (fipe_mc_ctrl.mc_initialized) {
                fipe_mc_restore();
                pci_config_teardown(&fipe_mc_ctrl.mc_pci_hdl);
                ndi_rele_devi(fipe_mc_ctrl.mc_dip);
                fipe_mc_ctrl.mc_initialized = B_FALSE;
        }
        bzero(&fipe_mc_ctrl, sizeof (fipe_mc_ctrl));
}

/*
 * Search device with specific pci ids.
 * In a table entry, an id of 0xffff and a NULL unitaddr act as wildcards
 * (see the matching loop in fipe_search_ioat_dev()).
 */
struct fipe_pci_ioat_id {
        uint16_t                venid;
        uint16_t                devid;
        uint16_t                subvenid;
        uint16_t                subsysid;
        char                    *unitaddr;
};

/* Supported IOAT engines: vendor 0x8086, any subsystem, any unit address. */
static struct fipe_pci_ioat_id fipe_pci_ioat_ids[] = {
        { 0x8086, 0x1a38, 0xffff, 0xffff, NULL },
        { 0x8086, 0x360b, 0xffff, 0xffff, NULL },
};

/*
 * ddi_walk_devs() callback used to locate the IOAT engine.
 *
 * Reads the PCI identification properties of `dip' and compares them with
 * the entries of fipe_pci_ioat_ids[]. A table value of 0xffff (or a NULL
 * unit address) acts as a wildcard. Nodes lacking any of the queried
 * properties are skipped.
 *
 * On a match a reference is taken on the node with ndi_hold_devi(), the
 * node is recorded in fipe_ioat_ctrl.ioat_dev_info and DDI_WALK_TERMINATE
 * is returned; otherwise DDI_WALK_CONTINUE is returned. `arg' is unused.
 */
/*ARGSUSED*/
static int
fipe_search_ioat_dev(dev_info_t *dip, void *arg)
{
        char *unit;
        struct fipe_pci_ioat_id *id;
        int i, max, venid, devid, subvenid, subsysid;

        /*
         * Query PCI id properties. 0xffffffff is passed as default value
         * so that a missing property can be told apart from a real id.
         */
        venid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            "vendor-id", 0xffffffff);
        if (venid == 0xffffffff) {
                return (DDI_WALK_CONTINUE);
        }
        devid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            "device-id", 0xffffffff);
        if (devid == 0xffffffff) {
                return (DDI_WALK_CONTINUE);
        }
        subvenid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            "subsystem-vendor-id", 0xffffffff);
        if (subvenid == 0xffffffff) {
                return (DDI_WALK_CONTINUE);
        }
        subsysid = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            "subsystem-id", 0xffffffff);
        /* Check the value just read, not subvenid a second time. */
        if (subsysid == 0xffffffff) {
                return (DDI_WALK_CONTINUE);
        }
        if (ddi_prop_lookup_string(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS,
            "unit-address", &unit) != DDI_PROP_SUCCESS) {
                return (DDI_WALK_CONTINUE);
        }

        /* Look for the first table entry matching this node. */
        max = sizeof (fipe_pci_ioat_ids) / sizeof (fipe_pci_ioat_ids[0]);
        for (i = 0; i < max; i++) {
                id = &fipe_pci_ioat_ids[i];
                if ((id->venid == 0xffffu || id->venid == venid) &&
                    (id->devid == 0xffffu || id->devid == devid) &&
                    (id->subvenid == 0xffffu || id->subvenid == subvenid) &&
                    (id->subsysid == 0xffffu || id->subsysid == subsysid) &&
                    (id->unitaddr == NULL || strcmp(id->unitaddr, unit) == 0)) {
                        break;
                }
        }
        /* The string was allocated by ddi_prop_lookup_string(). */
        ddi_prop_free(unit);
        if (i >= max) {
                return (DDI_WALK_CONTINUE);
        }

        /* Found IOAT device, hold one reference count. */
        ndi_hold_devi(dip);
        fipe_ioat_ctrl.ioat_dev_info = dip;

        return (DDI_WALK_TERMINATE);
}

/*
 * To enable FBDIMM idle power enhancement mechanism, IOAT will be used to
 * generate enough memory traffic to trigger memory controller thermal throttle
 * circuitry.
 * If dcopy/ioat is available, we will use dcopy interface to communicate
 * with IOAT. Otherwise the built-in driver will directly talk to IOAT
 * hardware.
 */
#ifdef  FIPE_IOAT_BUILTIN
/*
 * Builtin driver variant: start the IOAT channel to generate memory
 * traffic by programming the channel registers directly.
 *
 * Returns 0 when the channel has been started. Returns -1 when the channel
 * is already flagged as in use; in that case IOAT is marked as not ready
 * and permanently failed so that fipe stops touching the hardware.
 */
static int
fipe_ioat_trigger(void)
{
        uint16_t ctrl;
        uint32_t err;
        uint8_t *addr = fipe_ioat_ctrl.ioat_reg_addr;
        ddi_acc_handle_t handle = fipe_ioat_ctrl.ioat_reg_handle;

        /* Check channel in use flag. */
        ctrl = ddi_get16(handle, (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL));
        if (ctrl & 0x100) {
                /*
                 * Channel is in use by somebody else. IOAT driver may have
                 * been loaded, forbid fipe from accessing IOAT hardware
                 * anymore.
                 */
                fipe_ioat_ctrl.ioat_ready = B_FALSE;
                fipe_ioat_ctrl.ioat_failed = B_TRUE;
                /*
                 * ioat_start_fail_cnt only exists when FIPE_KSTAT_DETAIL is
                 * defined, so it must be updated through the detail macro
                 * (same as the dcopy based variant of this function).
                 */
                FIPE_KSTAT_DETAIL_INC(ioat_start_fail_cnt);
                return (-1);
        } else {
                /* Set channel in use flag. */
                ddi_put16(handle,
                    (uint16_t *)(addr + FIPE_IOAT_CHAN_CTRL), 0x100);
        }

        /* Write command address, low 32 bits first, then high 32 bits. */
        ddi_put32(handle,
            (uint32_t *)(addr + FIPE_IOAT_CHAN_ADDR_LO),
            (uint32_t)fipe_ioat_ctrl.ioat_cmd_physaddr);
        ddi_put32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ADDR_HI),
            (uint32_t)(fipe_ioat_ctrl.ioat_cmd_physaddr >> 32));

        /* Check and clear error flags by writing back the bits read. */
        err = ddi_get32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ERR));
        if (err != 0) {
                ddi_put32(handle, (uint32_t *)(addr + FIPE_IOAT_CHAN_ERR), err);
        }

        /* Start channel. */
        ddi_put8(handle, (uint8_t *)(addr + FIPE_IOAT_CHAN_CMD), 0x1);

        return (0);
}

/*
 * Builtin driver variant: stop the memory copy by resetting the IOAT
 * channel, then mark the channel as no longer in use.
 */
static void
fipe_ioat_cancel(void)
{
        ddi_acc_handle_t hdl = fipe_ioat_ctrl.ioat_reg_handle;
        uint8_t *regs = fipe_ioat_ctrl.ioat_reg_addr;
        uint32_t sts;

        /*
         * A single reset is not always reliable, so keep issuing the reset
         * command until the status register reports completion.
         */
        for (;;) {
                /* Issue reset channel command. */
                ddi_put8(hdl, (uint8_t *)(regs + FIPE_IOAT_CHAN_CMD), 0x20);

                /* Query command status, bit 0 signals completed reset. */
                sts = ddi_get32(hdl,
                    (uint32_t *)(regs + FIPE_IOAT_CHAN_STS_LO));
                if (sts & 0x1) {
                        break;
                }

                SMT_PAUSE();
        }

        /* Put channel into "not in use" state. */
        ddi_put16(hdl, (uint16_t *)(regs + FIPE_IOAT_CHAN_CTRL), 0);
}

/*
 * Builtin driver variant: locate the IOAT device and map its register
 * window. The function also serves as timeout(9F) handler and reschedules
 * itself every FIPE_IOAT_RETRY_INTERVAL microseconds after a non-fatal
 * failure. `arg' is unused.
 *
 * ioat_lock is held for the whole function, including the out_error path.
 */
/*ARGSUSED*/
static void
fipe_ioat_alloc(void *arg)
{
        int rc = 0, nregs;
        dev_info_t *dip;
        ddi_device_acc_attr_t attr;
        boolean_t fatal = B_FALSE;

        mutex_enter(&fipe_ioat_ctrl.ioat_lock);
        /*
         * fipe_ioat_alloc() is called in DEVICE ATTACH context when loaded.
         * In DEVICE ATTACH context, it can't call ddi_walk_devs(), so just
         * schedule a timer and exit.
         */
        if (fipe_ioat_ctrl.ioat_try_alloc == B_FALSE) {
                fipe_ioat_ctrl.ioat_try_alloc = B_TRUE;
                /* fatal is B_FALSE here, so out_error schedules a retry. */
                goto out_error;
        }

        /* Check whether has been initialized or encountered permanent error. */
        if (fipe_ioat_ctrl.ioat_ready || fipe_ioat_ctrl.ioat_failed ||
            fipe_ioat_ctrl.ioat_cancel) {
                fipe_ioat_ctrl.ioat_timerid = 0;
                mutex_exit(&fipe_ioat_ctrl.ioat_lock);
                return;
        }

        if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
                /* Find dev_info_t for IOAT engine. */
                ddi_walk_devs(ddi_root_node(), fipe_search_ioat_dev, NULL);
                if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
                        cmn_err(CE_NOTE,
                            "!fipe: no IOAT hardware found, disable pm.");
                        fatal = B_TRUE;
                        goto out_error;
                }
        }

        /* Map in IOAT control register window. */
        ASSERT(fipe_ioat_ctrl.ioat_dev_info != NULL);
        ASSERT(fipe_ioat_ctrl.ioat_reg_mapped == B_FALSE);
        dip = fipe_ioat_ctrl.ioat_dev_info;
        if (ddi_dev_nregs(dip, &nregs) != DDI_SUCCESS || nregs < 2) {
                cmn_err(CE_WARN, "!fipe: ioat has not enough register bars.");
                fatal = B_TRUE;
                goto out_error;
        }
        attr.devacc_attr_version = DDI_DEVICE_ATTR_V0;
        attr.devacc_attr_endian_flags = DDI_NEVERSWAP_ACC;
        attr.devacc_attr_dataorder = DDI_STRICTORDER_ACC;
        /* Register set 1 is mapped completely (offset 0, length 0). */
        rc = ddi_regs_map_setup(dip, 1,
            (caddr_t *)&fipe_ioat_ctrl.ioat_reg_addr,
            0, 0, &attr, &fipe_ioat_ctrl.ioat_reg_handle);
        if (rc != DDI_SUCCESS) {
                cmn_err(CE_WARN, "!fipe: failed to map IOAT registeres.");
                fatal = B_TRUE;
                goto out_error;
        }

        /* Mark IOAT status. */
        fipe_ioat_ctrl.ioat_reg_mapped = B_TRUE;
        fipe_ioat_ctrl.ioat_ready = B_TRUE;
        fipe_ioat_ctrl.ioat_failed = B_FALSE;
        fipe_ioat_ctrl.ioat_timerid = 0;
        mutex_exit(&fipe_ioat_ctrl.ioat_lock);

        return;

out_error:
        /* Entered with ioat_lock held. */
        fipe_ioat_ctrl.ioat_timerid = 0;
        if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
                if (fatal) {
                        /* Mark permanent error and give up. */
                        fipe_ioat_ctrl.ioat_failed = B_TRUE;
                        /*
                         * Release the reference count held by
                         * fipe_search_ioat_dev().
                         */
                        if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
                                ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
                                fipe_ioat_ctrl.ioat_dev_info = NULL;
                        }
                } else {
                        /*
                         * Schedule another timer to keep on trying.
                         * timeout() should always succeed, no need to check
                         * return.
                         */
                        fipe_ioat_ctrl.ioat_timerid = timeout(fipe_ioat_alloc,
                            NULL, drv_usectohz(FIPE_IOAT_RETRY_INTERVAL));
                }
        }
        mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}

/*
 * Builtin driver variant: cancel a pending allocation retry, unmap the IOAT
 * register window and mark IOAT as not ready.
 * The devinfo reference in ioat_dev_info is not released here.
 */
static void
fipe_ioat_free(void)
{
        mutex_enter(&fipe_ioat_ctrl.ioat_lock);
        /* Cancel timeout to avoid race condition. */
        if (fipe_ioat_ctrl.ioat_timerid != 0) {
                /*
                 * ioat_cancel tells a concurrently running fipe_ioat_alloc()
                 * not to allocate or reschedule. The lock is dropped around
                 * untimeout() because the handler itself takes ioat_lock.
                 */
                fipe_ioat_ctrl.ioat_cancel = B_TRUE;
                mutex_exit(&fipe_ioat_ctrl.ioat_lock);
                (void) untimeout(fipe_ioat_ctrl.ioat_timerid);
                mutex_enter(&fipe_ioat_ctrl.ioat_lock);
                fipe_ioat_ctrl.ioat_timerid = 0;
                fipe_ioat_ctrl.ioat_cancel = B_FALSE;
        }

        /* Unmap the register window if fipe_ioat_alloc() mapped it. */
        if (fipe_ioat_ctrl.ioat_reg_mapped) {
                ddi_regs_map_free(&fipe_ioat_ctrl.ioat_reg_handle);
                fipe_ioat_ctrl.ioat_reg_mapped = B_FALSE;
        }

        fipe_ioat_ctrl.ioat_ready = B_FALSE;
        mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}

#else   /* FIPE_IOAT_BUILTIN */

/*
 * Start the IOAT memory copy when entering power saving state.
 * The ring commands ioat_cmds[FIPE_IOAT_CMD_NUM] down to ioat_cmds[1] are
 * posted to the IOAT driver in that order; ioat_cmds[0] is not posted here.
 *
 * Returns 0 if all commands were posted, -1 as soon as one post fails.
 */
static int
fipe_ioat_trigger(void)
{
        dcopy_cmd_t *ring = fipe_ioat_ctrl.ioat_cmds;
        int slot = FIPE_IOAT_CMD_NUM;

        while (slot > 0) {
                if (dcopy_cmd_post(ring[slot]) != DCOPY_SUCCESS) {
                        /*
                         * Commands posted so far are left alone; a few
                         * small stray memory copies do little harm.
                         */
                        FIPE_KSTAT_DETAIL_INC(ioat_start_fail_cnt);
                        return (-1);
                }
                slot--;
        }

        return (0);
}

/*
 * Stop the memory copy started by fipe_ioat_trigger.
 * Posting ioat_cmds[0] breaks the command ring built by fipe_ioat_trigger.
 * A failure is only counted: the best recovery is to let it go, IOAT
 * recovers when the next command is posted on the same channel.
 */
static void
fipe_ioat_cancel(void)
{
        int rc;

        rc = dcopy_cmd_post(fipe_ioat_ctrl.ioat_cmds[0]);
        if (rc == DCOPY_SUCCESS) {
                return;
        }

        FIPE_KSTAT_DETAIL_INC(ioat_stop_fail_cnt);
}

/*
 * Allocate IOAT resources through the dcopy interface.
 * The function is called directly and also serves as timeout(9F) handler;
 * `arg' is unused.
 * Allocation may fail due to following reasons:
 * 1) IOAT driver hasn't been loaded yet. Keep on trying in this case.
 * 2) IOAT resources are temporarily unavailable.  Keep on trying in this case.
 * 3) Other non-recoverable reasons. Disable power management function.
 *
 * ioat_lock is dropped while talking to dcopy; every jump to out_error is
 * made with the lock released, and out_error takes it again.
 */
/*ARGSUSED*/
static void
fipe_ioat_alloc(void *arg)
{
        int idx, flags, rc = 0;
        uint64_t physaddr;
        boolean_t fatal = B_FALSE;
        dcopy_query_t info;
        dcopy_handle_t handle;
        dcopy_cmd_t cmds[FIPE_IOAT_CMD_NUM + 1];

        mutex_enter(&fipe_ioat_ctrl.ioat_lock);
        /*
         * fipe_ioat_alloc() is called in DEVICE ATTACH context when loaded.
         * In DEVICE ATTACH context, it can't call ddi_walk_devs(), so just
         * schedule a timer and exit.
         */
        if (fipe_ioat_ctrl.ioat_try_alloc == B_FALSE) {
                fipe_ioat_ctrl.ioat_try_alloc = B_TRUE;
                mutex_exit(&fipe_ioat_ctrl.ioat_lock);
                /* fatal is B_FALSE here, so out_error schedules a retry. */
                goto out_error;
        }

        /*
         * Check whether device has been initialized or if it encountered
         * some permanent error.
         */
        if (fipe_ioat_ctrl.ioat_ready || fipe_ioat_ctrl.ioat_failed ||
            fipe_ioat_ctrl.ioat_cancel) {
                fipe_ioat_ctrl.ioat_timerid = 0;
                mutex_exit(&fipe_ioat_ctrl.ioat_lock);
                return;
        }

        if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
                /* Find dev_info_t for IOAT engine (done with the lock held). */
                ddi_walk_devs(ddi_root_node(), fipe_search_ioat_dev, NULL);
                if (fipe_ioat_ctrl.ioat_dev_info == NULL) {
                        cmn_err(CE_NOTE,
                            "!fipe: no IOAT hardware found, disable pm.");
                        mutex_exit(&fipe_ioat_ctrl.ioat_lock);
                        fatal = B_TRUE;
                        goto out_error;
                }
        }
        mutex_exit(&fipe_ioat_ctrl.ioat_lock);

        /* Check, allocate and initialize IOAT resources with lock released. */
        dcopy_query(&info);
        if (info.dq_version < DCOPY_QUERY_V0) {
                /* Permanent error, give up. */
                cmn_err(CE_WARN, "!fipe: IOAT driver version mismatch.");
                fatal = B_TRUE;
                goto out_error;
        } else if (info.dq_num_channels == 0) {
                /* IOAT driver hasn't been loaded, keep trying. */
                goto out_error;
        }

        /* Allocate IOAT channel. */
        rc = dcopy_alloc(DCOPY_NOSLEEP, &handle);
        if (rc == DCOPY_NORESOURCES) {
                /* Resource temporarily not available, keep trying. */
                goto out_error;
        } else if (rc != DCOPY_SUCCESS) {
                /* Permanent error, give up. */
                cmn_err(CE_WARN, "!fipe: failed to allocate IOAT channel.");
                fatal = B_TRUE;
                goto out_error;
        }

        /*
         * Allocate multiple IOAT commands and organize them into a ring to
         * loop forever. The number of commands is determined by the IOAT
         * descriptor size and the memory interleave pattern.
         * cmd[0] is used to break the loop and disable IOAT operation.
         * cmd[1, FIPE_IOAT_CMD_NUM] are grouped into a ring and cmd[1] is the
         * list head.
         */
        /* Zeroed first so that out_freecmd can tell which slots are in use. */
        bzero(cmds, sizeof (cmds));
        physaddr = fipe_ioat_ctrl.ioat_buf_physaddr;
        for (idx = FIPE_IOAT_CMD_NUM; idx >= 0; idx--) {
                /* Allocate IOAT commands. */
                if (idx == 0 || idx == FIPE_IOAT_CMD_NUM) {
                        /* Standalone cancel command or tail of the list. */
                        flags = DCOPY_NOSLEEP;
                } else {
                        /*
                         * To link commands into a list, the initial value of
                         * cmd needs to be set to the next cmd on the list.
                         */
                        flags = DCOPY_NOSLEEP | DCOPY_ALLOC_LINK;
                        cmds[idx] = cmds[idx + 1];
                }
                rc = dcopy_cmd_alloc(handle, flags, &cmds[idx]);
                if (rc == DCOPY_NORESOURCES) {
                        /* Temporary shortage: free and retry later. */
                        goto out_freecmd;
                } else if (rc != DCOPY_SUCCESS) {
                        /* Permanent error, give up. */
                        cmn_err(CE_WARN,
                            "!fipe: failed to allocate IOAT command.");
                        fatal = B_TRUE;
                        goto out_freecmd;
                }

                /* Disable src/dst snoop to improve CPU cache efficiency. */
                cmds[idx]->dp_flags = DCOPY_CMD_NOSRCSNP | DCOPY_CMD_NODSTSNP;
                /* Specially handle commands on the list. */
                if (idx != 0) {
                        /* Disable IOAT status. */
                        cmds[idx]->dp_flags |= DCOPY_CMD_NOSTAT;
                        /* Disable waiting for resources. */
                        cmds[idx]->dp_flags |= DCOPY_CMD_NOWAIT;
                        if (idx == 1) {
                                /* The list head, chain command into loop. */
                                cmds[idx]->dp_flags |= DCOPY_CMD_LOOP;
                        } else {
                                /* Queue all other commands except head. */
                                cmds[idx]->dp_flags |= DCOPY_CMD_QUEUE;
                        }
                }
                /* Every command copies within the preallocated buffer. */
                cmds[idx]->dp_cmd = DCOPY_CMD_COPY;
                cmds[idx]->dp.copy.cc_source = physaddr;
                cmds[idx]->dp.copy.cc_dest = physaddr + FIPE_MC_MEMORY_OFFSET;
                if (idx == 0) {
                        /*
                         * Command 0 is used to cancel memory copy by breaking
                         * the ring created in fipe_ioat_trigger().
                         * For efficiency, use the smallest memory copy size.
                         */
                        cmds[idx]->dp.copy.cc_size = 1;
                } else {
                        cmds[idx]->dp.copy.cc_size = FIPE_MC_MEMORY_SIZE;
                }
        }

        /* Update IOAT control status if it hasn't been initialized yet. */
        mutex_enter(&fipe_ioat_ctrl.ioat_lock);
        if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
                fipe_ioat_ctrl.ioat_handle = handle;
                for (idx = 0; idx <= FIPE_IOAT_CMD_NUM; idx++) {
                        fipe_ioat_ctrl.ioat_cmds[idx] = cmds[idx];
                }
                fipe_ioat_ctrl.ioat_ready = B_TRUE;
                fipe_ioat_ctrl.ioat_failed = B_FALSE;
                fipe_ioat_ctrl.ioat_timerid = 0;
                mutex_exit(&fipe_ioat_ctrl.ioat_lock);
                return;
        }
        mutex_exit(&fipe_ioat_ctrl.ioat_lock);
        /*
         * Already initialized by another thread, or a cancel is in progress:
         * fall through to free the resources allocated above.
         */

out_freecmd:
        if (cmds[0] != NULL) {
                dcopy_cmd_free(&cmds[0]);
        }
        /* Only need to free head, dcopy will free all commands on the list. */
        for (idx = 1; idx <= FIPE_IOAT_CMD_NUM; idx++) {
                if (cmds[idx] != NULL) {
                        dcopy_cmd_free(&cmds[idx]);
                        break;
                }
        }
        dcopy_free(&handle);

out_error:
        /* Entered with ioat_lock released. */
        mutex_enter(&fipe_ioat_ctrl.ioat_lock);
        fipe_ioat_ctrl.ioat_timerid = 0;
        if (!fipe_ioat_ctrl.ioat_ready && !fipe_ioat_ctrl.ioat_cancel) {
                if (fatal) {
                        /* Mark permanent error and give up. */
                        fipe_ioat_ctrl.ioat_failed = B_TRUE;
                        /*
                         * Release the reference count held by
                         * fipe_search_ioat_dev().
                         */
                        if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
                                ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
                                fipe_ioat_ctrl.ioat_dev_info = NULL;
                        }
                } else {
                        /*
                         * Schedule another timer to keep on trying.
                         * timeout() should always succeed, no need to check.
                         */
                        fipe_ioat_ctrl.ioat_timerid = timeout(fipe_ioat_alloc,
                            NULL, drv_usectohz(FIPE_IOAT_RETRY_INTERVAL));
                }
        }
        mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}

/*
 * Release everything fipe_ioat_alloc set up: cancel a pending retry
 * timeout, free the dcopy commands and channel if they were allocated, and
 * drop the reference held on the IOAT devinfo node.
 */
static void
fipe_ioat_free(void)
{
        dcopy_cmd_t *cmd_tbl = fipe_ioat_ctrl.ioat_cmds;
        int slot;

        mutex_enter(&fipe_ioat_ctrl.ioat_lock);

        /*
         * Cancel the retry timeout first. ioat_cancel keeps a concurrently
         * running handler from allocating or rescheduling, and the lock is
         * dropped around untimeout() since the handler takes it as well.
         */
        if (fipe_ioat_ctrl.ioat_timerid != 0) {
                fipe_ioat_ctrl.ioat_cancel = B_TRUE;
                mutex_exit(&fipe_ioat_ctrl.ioat_lock);
                (void) untimeout(fipe_ioat_ctrl.ioat_timerid);
                mutex_enter(&fipe_ioat_ctrl.ioat_lock);
                fipe_ioat_ctrl.ioat_timerid = 0;
                fipe_ioat_ctrl.ioat_cancel = B_FALSE;
        }

        /* Give the IOAT commands and the channel back to dcopy. */
        if (fipe_ioat_ctrl.ioat_ready) {
                if (cmd_tbl[0] != NULL) {
                        dcopy_cmd_free(&cmd_tbl[0]);
                }

                /*
                 * The ring commands are linked, so only the first one that
                 * is present is freed.
                 */
                slot = 1;
                while (slot <= FIPE_IOAT_CMD_NUM && cmd_tbl[slot] == NULL) {
                        slot++;
                }
                if (slot <= FIPE_IOAT_CMD_NUM) {
                        dcopy_cmd_free(&cmd_tbl[slot]);
                }

                bzero(fipe_ioat_ctrl.ioat_cmds,
                    sizeof (fipe_ioat_ctrl.ioat_cmds));
                dcopy_free(&fipe_ioat_ctrl.ioat_handle);
                fipe_ioat_ctrl.ioat_handle = NULL;
                fipe_ioat_ctrl.ioat_ready = B_FALSE;
        }

        /* Drop the reference taken when the IOAT node was found. */
        if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
                ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
                fipe_ioat_ctrl.ioat_dev_info = NULL;
        }

        mutex_exit(&fipe_ioat_ctrl.ioat_lock);
}
#endif  /* FIPE_IOAT_BUILTIN */

/*
 * Initialize IOAT relative resources.
 */
static int
fipe_ioat_init(void)
{
        char *buf;
        size_t size;

        bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl));
        mutex_init(&fipe_ioat_ctrl.ioat_lock, NULL, MUTEX_DRIVER, NULL);

        /*
         * Allocate memory for IOAT memory copy operation.
         * The allocated memory should be page aligned to achieve better power
         * savings.
         * Don't use ddi_dma_mem_alloc here to keep thing simple.  This also
         * makes quiesce easier.
         */
        size = PAGESIZE;
        buf = kmem_zalloc(size, KM_SLEEP);
        if ((intptr_t)buf & PAGEOFFSET) {
                kmem_free(buf, PAGESIZE);
                size <<= 1;
                buf = kmem_zalloc(size, KM_SLEEP);
        }
        fipe_ioat_ctrl.ioat_buf_size = size;
        fipe_ioat_ctrl.ioat_buf_start = buf;
        buf = (char *)P2ROUNDUP((intptr_t)buf, PAGESIZE);
        fipe_ioat_ctrl.ioat_buf_virtaddr = buf;
        fipe_ioat_ctrl.ioat_buf_physaddr = hat_getpfnum(kas.a_hat, buf);
        fipe_ioat_ctrl.ioat_buf_physaddr <<= PAGESHIFT;

#ifdef  FIPE_IOAT_BUILTIN
        {
                uint64_t bufpa;
                /* IOAT descriptor data structure copied from ioat.h. */
                struct fipe_ioat_cmd_desc {
                        uint32_t        dd_size;
                        uint32_t        dd_ctrl;
                        uint64_t        dd_src_paddr;
                        uint64_t        dd_dest_paddr;
                        uint64_t        dd_next_desc;
                        uint64_t        dd_res4;
                        uint64_t        dd_res5;
                        uint64_t        dd_res6;
                        uint64_t        dd_res7;
                } *desc;

                /*
                 * Build two IOAT command descriptors and chain them into ring.
                 * Control flags as below:
                 *      0x2: disable source snoop
                 *      0x4: disable destination snoop
                 *      0x0 << 24: memory copy operation
                 * The layout for command descriptors and memory buffers are
                 * organized for power saving effect, please don't change it.
                 */
                buf = fipe_ioat_ctrl.ioat_buf_virtaddr;
                bufpa = fipe_ioat_ctrl.ioat_buf_physaddr;
                fipe_ioat_ctrl.ioat_cmd_physaddr = bufpa;

                /* First command descriptor. */
                desc = (struct fipe_ioat_cmd_desc *)(buf);
                desc->dd_size = 128;
                desc->dd_ctrl = 0x6;
                desc->dd_src_paddr = bufpa + 2048;
                desc->dd_dest_paddr = bufpa + 3072;
                /* Point to second descriptor. */
                desc->dd_next_desc = bufpa + 64;

                /* Second command descriptor. */
                desc = (struct fipe_ioat_cmd_desc *)(buf + 64);
                desc->dd_size = 128;
                desc->dd_ctrl = 0x6;
                desc->dd_src_paddr = bufpa + 2048;
                desc->dd_dest_paddr = bufpa + 3072;
                /* Point to first descriptor. */
                desc->dd_next_desc = bufpa;
        }
#endif  /* FIPE_IOAT_BUILTIN */

        return (0);
}

/*
 * Tear down the state set up by fipe_ioat_init(): drop any devinfo hold,
 * free the copy buffer, destroy the lock and clear the control structure.
 */
static void
fipe_ioat_fini(void)
{
        void *mem;

        /* Drop the devinfo hold, if one is still outstanding. */
        if (fipe_ioat_ctrl.ioat_dev_info != NULL) {
                ndi_rele_devi(fipe_ioat_ctrl.ioat_dev_info);
                fipe_ioat_ctrl.ioat_dev_info = NULL;
        }

        /* Free the raw (possibly unaligned) allocation, not the aligned one. */
        mem = fipe_ioat_ctrl.ioat_buf_start;
        if (mem != NULL) {
                ASSERT(fipe_ioat_ctrl.ioat_buf_size != 0);
                kmem_free(mem, fipe_ioat_ctrl.ioat_buf_size);
        }

        mutex_destroy(&fipe_ioat_ctrl.ioat_lock);
        bzero(&fipe_ioat_ctrl, sizeof (fipe_ioat_ctrl));
}

/*
 * Create the cpu-idle property handles used by the idle callbacks and
 * register the callbacks themselves.
 * Returns 0 on success or when already started.  Returns -1 on failure,
 * after destroying (in reverse order) every handle created so far.
 */
static int
fipe_idle_start(void)
{
        if (fipe_idle_ctrl.idle_ready) {
                return (0);
        }

        if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_ENTER_TIMESTAMP,
            &fipe_idle_ctrl.prop_enter) != 0) {
                cmn_err(CE_WARN, "!fipe: failed to get enter_ts property.");
                goto fail;
        }
        if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_EXIT_TIMESTAMP,
            &fipe_idle_ctrl.prop_exit) != 0) {
                cmn_err(CE_WARN, "!fipe: failed to get exit_ts property.");
                goto undo_enter;
        }
        if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_TOTAL_IDLE_TIME,
            &fipe_idle_ctrl.prop_idle) != 0) {
                cmn_err(CE_WARN, "!fipe: failed to get idle_time property.");
                goto undo_exit;
        }
        if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_TOTAL_BUSY_TIME,
            &fipe_idle_ctrl.prop_busy) != 0) {
                cmn_err(CE_WARN, "!fipe: failed to get busy_time property.");
                goto undo_idle;
        }
        if (cpu_idle_prop_create_handle(CPU_IDLE_PROP_INTERRUPT_COUNT,
            &fipe_idle_ctrl.prop_intr) != 0) {
                cmn_err(CE_WARN, "!fipe: failed to get intr_count property.");
                goto undo_busy;
        }

        /* All handles exist; hook into idle state notifications. */
        if (cpu_idle_register_callback(CPU_IDLE_CB_PRIO_FIPE, &fipe_idle_cb,
            NULL, &fipe_idle_ctrl.cb_handle) != 0) {
                cmn_err(CE_WARN, "!fipe: failed to register cpuidle callback.");
                goto undo_intr;
        }

        fipe_idle_ctrl.idle_ready = B_TRUE;

        return (0);

        /* Unwind ladder: each label releases one handle and falls through. */
undo_intr:
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_intr);
undo_busy:
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_busy);
undo_idle:
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
undo_exit:
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
undo_enter:
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);
fail:
        return (-1);
}

/*
 * Unregister the idle callbacks and destroy the property handles created
 * by fipe_idle_start().
 * Returns 0 on success or when not started, -1 if the callbacks could not
 * be unregistered (in which case the handles are left intact).
 */
static int
fipe_idle_stop(void)
{
        if (!fipe_idle_ctrl.idle_ready) {
                return (0);
        }

        if (cpu_idle_unregister_callback(fipe_idle_ctrl.cb_handle) != 0) {
                cmn_err(CE_WARN,
                    "!fipe: failed to unregister cpuidle callback.");
                return (-1);
        }

        /* Release the handles in the reverse order of their creation. */
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_intr);
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_busy);
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_idle);
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_exit);
        (void) cpu_idle_prop_destroy_handle(fipe_idle_ctrl.prop_enter);

        fipe_idle_ctrl.idle_ready = B_FALSE;

        return (0);
}

#ifdef  FIPE_KSTAT_SUPPORT
/*
 * kstat update routine: refresh the named statistics from the current
 * driver state.  The kstat is read-only; writes are rejected with EACCES.
 */
static int
fipe_kstat_update(kstat_t *ksp, int rw)
{
        hrtime_t pm_time;
        struct fipe_kstat_s *stats;

        if (rw == KSTAT_WRITE) {
                return (EACCES);
        }

        stats = ksp->ks_data;
        stats->fipe_enabled.value.i32 = fipe_gbl_ctrl.pm_enabled ? 1 : 0;
        stats->fipe_policy.value.i32 = fipe_pm_policy;

        /* time_in_pm is run through scalehrtime() before being exported. */
        pm_time = fipe_gbl_ctrl.time_in_pm;
        scalehrtime(&pm_time);
        stats->fipe_pm_time.value.ui64 = (uint64_t)pm_time;

#ifdef  FIPE_KSTAT_DETAIL
        stats->ioat_ready.value.i32 = fipe_ioat_ctrl.ioat_ready ? 1 : 0;
#endif  /* FIPE_KSTAT_DETAIL */

        return (0);
}
#endif  /* FIPE_KSTAT_SUPPORT */

/*
 * Initialize memory power management subsystem.
 * Returns 0 on success, -1 if the memory controller or IOAT state could
 * not be initialized (the global control structure is cleared again).
 * Note: This function should only be called from ATTACH.
 * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
 */
int
fipe_init(dev_info_t *dip)
{
        hrtime_t scaled;
        size_t bufsz;
#ifdef  FIPE_KSTAT_SUPPORT
        kstat_t *ksp;
#endif  /* FIPE_KSTAT_SUPPORT */

        /* Start from a clean global control structure. */
        bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));
        mutex_init(&fipe_gbl_ctrl.lock, NULL, MUTEX_DRIVER, NULL);

        /*
         * The device property overrides the compiled-in/tunable policy.
         * Fall back to BALANCE if the resulting value is out of range.
         */
        fipe_pm_policy = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
            FIPE_PROP_PM_POLICY, fipe_pm_policy);
        if (fipe_pm_policy < 0 || fipe_pm_policy >= FIPE_PM_POLICY_MAX) {
                cmn_err(CE_CONT,
                    "?fipe: invalid power management policy %d.\n",
                    fipe_pm_policy);
                fipe_pm_policy = FIPE_PM_POLICY_BALANCE;
        }
        fipe_profile_curr = &fipe_profiles[fipe_pm_policy];

        /*
         * Compute unscaled hrtime value corresponding to FIPE_STAT_INTERVAL
         * by scaling a reference value and using the resulting ratio.
         * (1 << 36) should be big enough here.
         */
        scaled = 1ULL << 36;
        scalehrtime(&scaled);
        fipe_idle_ctrl.tick_interval =
            FIPE_STAT_INTERVAL * (1ULL << 36) / scaled;

        if (fipe_mc_init(dip) != 0) {
                cmn_err(CE_WARN, "!fipe: failed to initialize mc state.");
                goto fail_mc;
        }
        if (fipe_ioat_init() != 0) {
                cmn_err(CE_NOTE, "!fipe: failed to initialize ioat state.");
                goto fail_ioat;
        }

        /*
         * Allocate the per-CPU state array with enough slack that its start
         * can be rounded up to a CPU_CACHE_COHERENCE_SIZE boundary.
         */
        bufsz = max_ncpus * sizeof (fipe_cpu_state_t) +
            CPU_CACHE_COHERENCE_SIZE;
        fipe_gbl_ctrl.state_buf = kmem_zalloc(bufsz, KM_SLEEP);
        fipe_gbl_ctrl.state_size = bufsz;
        fipe_cpu_states = (fipe_cpu_state_t *)P2ROUNDUP(
            (intptr_t)fipe_gbl_ctrl.state_buf, CPU_CACHE_COHERENCE_SIZE);

#ifdef  FIPE_KSTAT_SUPPORT
        /* A missing kstat is reported but does not fail initialization. */
        ksp = kstat_create("fipe", 0, "fipe-pm", "misc",
            KSTAT_TYPE_NAMED, sizeof (fipe_kstat) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL);
        fipe_gbl_ctrl.fipe_kstat = ksp;
        if (ksp != NULL) {
                ksp->ks_lock = &fipe_gbl_ctrl.lock;
                ksp->ks_data = &fipe_kstat;
                ksp->ks_update = fipe_kstat_update;
                kstat_install(ksp);
        } else {
                cmn_err(CE_CONT, "?fipe: failed to create kstat object.\n");
        }
#endif  /* FIPE_KSTAT_SUPPORT */

        return (0);

fail_ioat:
        fipe_mc_fini();
fail_mc:
        mutex_destroy(&fipe_gbl_ctrl.lock);
        bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));

        return (-1);
}

/*
 * Destroy memory power management subsystem.
 * Returns EBUSY if PM is still enabled (nothing is torn down in that case),
 * 0 otherwise.
 * Note: This function should only be called from DETACH.
 * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
 */
int
fipe_fini(void)
{
        void *states;

        /* Refuse to tear down while PM is still running. */
        if (fipe_gbl_ctrl.pm_enabled) {
                cmn_err(CE_NOTE, "!fipe: call fipe_fini without stopping PM.");
                return (EBUSY);
        }
        ASSERT(!fipe_gbl_ctrl.pm_active);

        fipe_ioat_fini();
        fipe_mc_fini();

#ifdef  FIPE_KSTAT_SUPPORT
        if (fipe_gbl_ctrl.fipe_kstat != NULL) {
                kstat_delete(fipe_gbl_ctrl.fipe_kstat);
                fipe_gbl_ctrl.fipe_kstat = NULL;
        }
#endif  /* FIPE_KSTAT_SUPPORT */

        /* Free the raw per-CPU allocation; fipe_cpu_states points into it. */
        states = fipe_gbl_ctrl.state_buf;
        if (states != NULL) {
                ASSERT(fipe_gbl_ctrl.state_size != 0);
                kmem_free(states, fipe_gbl_ctrl.state_size);
                fipe_cpu_states = NULL;
        }

        fipe_profile_curr = NULL;
        mutex_destroy(&fipe_gbl_ctrl.lock);
        bzero(&fipe_gbl_ctrl, sizeof (fipe_gbl_ctrl));

        return (0);
}

/*
 * Start memory power management subsystem.
 * Returns 0 on success or when already started, -1 if the idle callbacks
 * could not be set up (IOAT resources are released again in that case).
 * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
 */
int
fipe_start(void)
{
        if (fipe_gbl_ctrl.pm_enabled == B_TRUE) {
                return (0);
        }

        /* Reset per-CPU state, then kick off IOAT resource allocation. */
        bzero(fipe_cpu_states, max_ncpus * sizeof (fipe_cpu_states[0]));
        fipe_ioat_alloc(NULL);

        if (fipe_idle_start() == 0) {
                fipe_gbl_ctrl.pm_enabled = B_TRUE;
                return (0);
        }

        cmn_err(CE_NOTE, "!fipe: failed to start PM subsystem.");
        fipe_ioat_free();

        return (-1);
}

/*
 * Stop memory power management subsystem.
 * Returns 0 on success or when not running, -1 if the idle callbacks could
 * not be removed (PM then stays enabled).
 * Note: caller must ensure exclusive access to all fipe_xxx interfaces.
 */
int
fipe_stop(void)
{
        if (!fipe_gbl_ctrl.pm_enabled) {
                ASSERT(!fipe_gbl_ctrl.pm_active);
                return (0);
        }

        /* Remove the idle callbacks before releasing IOAT resources. */
        if (fipe_idle_stop() != 0) {
                cmn_err(CE_NOTE,
                    "!fipe: failed to stop PM subsystem.");
                return (-1);
        }
        fipe_ioat_free();
        fipe_gbl_ctrl.pm_enabled = B_FALSE;
        ASSERT(!fipe_gbl_ctrl.pm_active);

        return (0);
}

/*
 * Remember the current power management policy in fipe_pm_policy_saved and
 * switch to FIPE_PM_POLICY_DISABLE.  Always returns 0.
 */
int
fipe_suspend(void)
{
        fipe_pm_policy_saved = fipe_pm_policy;

        /* Selecting the DISABLE policy and its profile turns PM off. */
        fipe_pm_policy = FIPE_PM_POLICY_DISABLE;
        fipe_profile_curr = &fipe_profiles[FIPE_PM_POLICY_DISABLE];

        return (0);
}

/*
 * Reinstate the power management policy recorded by fipe_suspend(), along
 * with its matching profile.  Always returns 0.
 */
int
fipe_resume(void)
{
        fipe_pm_policy = fipe_pm_policy_saved;
        fipe_profile_curr = &fipe_profiles[fipe_pm_policy_saved];

        return (0);
}

/*
 * Return the current power management policy (the value of fipe_pm_policy).
 */
fipe_pm_policy_t
fipe_get_pmpolicy(void)
{
        return (fipe_pm_policy);
}

/*
 * Select a new power management policy and its matching profile.
 * Returns 0 on success, EINVAL if the policy is outside
 * [0, FIPE_PM_POLICY_MAX).
 */
int
fipe_set_pmpolicy(fipe_pm_policy_t policy)
{
        if (policy >= 0 && policy < FIPE_PM_POLICY_MAX) {
                fipe_pm_policy = policy;
                fipe_profile_curr = &fipe_profiles[policy];
                return (0);
        }

        return (EINVAL);
}

/*
 * Try to put memory into power saving state.  Called from fipe_idle_enter()
 * by the CPU whose increment brought fipe_gbl_ctrl.cpu_count up to ncpus.
 *
 * The condition (fipe_gbl_ctrl.cpu_count == ncpus) is rechecked after every
 * step to make sure that no other CPU is trying to wake up the system from
 * memory power saving state.  If it no longer holds, whatever has been done
 * so far is undone and pm_active is cleared again.
 * If a CPU is waking up the system, fipe_disable() will set
 * fipe_gbl_ctrl.pm_active to false as soon as possible and allow other CPUs
 * to continue, and it will take the responsibility to recover the system
 * from memory power saving state.
 *
 * throttle is passed to fipe_mc_change(); check_func/check_arg, if given,
 * are invoked once before anything else is attempted.
 */
static void
fipe_enable(int throttle, cpu_idle_check_wakeup_t check_func, void* check_arg)
{
        extern void membar_sync(void);

        FIPE_KSTAT_DETAIL_INC(pm_tryenter_cnt);

        /*
         * Check CPU wakeup events.
         */
        if (check_func != NULL) {
                (*check_func)(check_arg);
        }

        /*
         * Try to acquire mutex, which also implicitly has the same effect
         * of calling membar_sync().
         * If mutex_tryenter fails, that means another CPU is waking up.
         */
        if (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
                FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
        /*
         * Handle a special race condition for the case that a CPU wakes
         * and then enters into idle state within a short period.
         * This case can't be reliably detected by the cpu_count mechanism.
         */
        } else if (fipe_gbl_ctrl.pm_active) {
                FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
                mutex_exit(&fipe_gbl_ctrl.lock);
        } else {
                /*
                 * Publish pm_active before re-reading cpu_count, so that a
                 * waking CPU spinning in fipe_disable() sees the flag.
                 */
                fipe_gbl_ctrl.pm_active = B_TRUE;
                membar_sync();
                if (fipe_gbl_ctrl.cpu_count != ncpus) {
                        /* A CPU woke up before anything was started. */
                        FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
                        fipe_gbl_ctrl.pm_active = B_FALSE;
                } else if (fipe_ioat_trigger() != 0) {
                        /* IOAT could not be started; nothing to undo. */
                        fipe_gbl_ctrl.pm_active = B_FALSE;
                } else if (fipe_gbl_ctrl.cpu_count != ncpus ||
                    fipe_mc_change(throttle) != 0) {
                        /*
                         * Either a CPU woke up after IOAT was triggered or
                         * the memory controller change failed: undo IOAT.
                         * Only the wakeup case is counted as a race.
                         */
                        fipe_gbl_ctrl.pm_active = B_FALSE;
                        fipe_ioat_cancel();
                        if (fipe_gbl_ctrl.cpu_count != ncpus) {
                                FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
                        }
                } else if (fipe_gbl_ctrl.cpu_count != ncpus) {
                        /* A CPU woke up after both steps: undo both. */
                        fipe_gbl_ctrl.pm_active = B_FALSE;
                        fipe_mc_restore();
                        fipe_ioat_cancel();
                        FIPE_KSTAT_DETAIL_INC(pm_race_cnt);
                } else {
                        /* Power saving is now active; pm_active stays set. */
                        FIPE_KSTAT_DETAIL_INC(pm_success_cnt);
                }
                mutex_exit(&fipe_gbl_ctrl.lock);
        }
}

/*
 * Take the system out of memory power saving state, if it is in it.
 * Called from fipe_idle_exit() by the CPU that observed cpu_count == ncpus
 * just before its own decrement.
 *
 * The function spins on fipe_gbl_ctrl.lock with mutex_tryenter().  While the
 * lock is held elsewhere it gives up as soon as pm_active reads false, since
 * there is then nothing to disable.
 */
static void
fipe_disable(void)
{
        /*
         * Try to acquire lock, which also implicitly has the same effect
         * of calling membar_sync().
         */
        while (mutex_tryenter(&fipe_gbl_ctrl.lock) == 0) {
                /*
                 * If power saving is inactive, just return and all dirty
                 * house-keeping work will be handled in fipe_enable().
                 */
                if (fipe_gbl_ctrl.pm_active == B_FALSE) {
                        return;
                } else {
                        /* Lock holder is still busy; pause and retry. */
                        (void) SMT_PAUSE();
                }
        }

        /* Disable power saving if it's active. */
        if (fipe_gbl_ctrl.pm_active) {
                /*
                 * Set pm_active to FALSE as soon as possible to prevent
                 * other CPUs from waiting on pm_active flag.
                 */
                fipe_gbl_ctrl.pm_active = B_FALSE;
                membar_producer();
                /* Undo fipe_enable(): memory controller first, then IOAT. */
                fipe_mc_restore();
                fipe_ioat_cancel();
        }

        mutex_exit(&fipe_gbl_ctrl.lock);
}

/*
 * Decide whether the current CPU qualifies for memory power saving.
 *
 * sp  - per-CPU state of the current CPU; updated in place.
 * ctx - idle callback context used to read the cpu-idle properties.
 * ts  - idle enter timestamp of the current idle period.
 *
 * Once per sampling interval (when ts has reached sp->next_ts) the busy/idle
 * ratio and the interrupt rate of the CPU are re-evaluated and the verdict
 * is cached in sp->cond_ready.  Between samples the cached verdict is
 * returned, unless the CPU has entered idle FIPE_PROF_IDLE_COUNT times
 * within the interval, in which case it is throttled.
 *
 * The return value always equals sp->cond_ready on return.
 */
/*ARGSUSED*/
static boolean_t
fipe_check_cpu(struct fipe_cpu_state *sp, cpu_idle_callback_context_t ctx,
    hrtime_t ts)
{
        if (cpu_flagged_offline(CPU->cpu_flags)) {
                /* Treat CPU in offline state as ready. */
                sp->cond_ready = B_TRUE;
                return (B_TRUE);
        } else if (sp->next_ts <= ts) {
                /* The sampling interval has elapsed; take a new sample. */
                uint64_t intr;
                hrtime_t idle, busy, diff;
                cpu_idle_prop_value_t val;

                /* Set default value. */
                sp->cond_ready = B_TRUE;
                sp->idle_count = 0;

                /*
                 * Calculate idle percent: idle and busy become the time
                 * accumulated since the previous sample.
                 */
                idle = sp->last_idle;
                sp->last_idle = cpu_idle_prop_get_hrtime(
                    fipe_idle_ctrl.prop_idle, ctx);
                idle = sp->last_idle - idle;
                busy = sp->last_busy;
                sp->last_busy = cpu_idle_prop_get_hrtime(
                    fipe_idle_ctrl.prop_busy, ctx);
                busy = sp->last_busy - busy;
                /*
                 * Check idle condition.  The comparison below is
                 * busy / (busy + idle) > FIPE_PROF_BUSY_THRESHOLD / 100,
                 * rearranged to avoid a division.  A sample without both
                 * idle and busy time is treated as busy.
                 */
                if (idle > 0 && busy > 0) {
                        if (busy * (100 - FIPE_PROF_BUSY_THRESHOLD) >
                            idle * FIPE_PROF_BUSY_THRESHOLD) {
                                FIPE_KSTAT_DETAIL_INC(cpu_busy_cnt);
                                sp->cond_ready = B_FALSE;
                        } else {
                                FIPE_KSTAT_DETAIL_INC(cpu_idle_cnt);
                        }
                } else {
                        FIPE_KSTAT_DETAIL_INC(cpu_busy_cnt);
                        sp->cond_ready = B_FALSE;
                }

                /*
                 * Calculate interrupt count.  diff is the distance between
                 * the previous and the new sampling deadline; the interrupt
                 * delta is normalized to one tick_interval with it.
                 * If the count cannot be read, or diff is zero, assume
                 * FIPE_PROF_INTR_THRESHOLD interrupts.
                 */
                diff = sp->next_ts;
                sp->next_ts = ts + fipe_idle_ctrl.tick_interval;
                diff = sp->next_ts - diff;
                intr = sp->last_intr;
                if (cpu_idle_prop_get_value(fipe_idle_ctrl.prop_intr, ctx,
                    &val) == 0) {
                        sp->last_intr = val.cipv_uint64;
                        intr = sp->last_intr - intr;
                        if (diff != 0) {
                                intr = intr * fipe_idle_ctrl.tick_interval;
                                intr /= diff;
                        } else {
                                intr = FIPE_PROF_INTR_THRESHOLD;
                        }
                } else {
                        intr = FIPE_PROF_INTR_THRESHOLD;
                }

                /*
                 * System is busy with interrupts, so disable all PM
                 * status checks for INTR_BUSY_THROTTLE ticks.
                 * Interrupts are disabled when FIPE callbacks are called,
                 * so this optimization will help to reduce interrupt
                 * latency.
                 * A lower rate that still reaches FIPE_PROF_INTR_THRESHOLD
                 * only marks the CPU as not ready for this interval.
                 */
                if (intr >= FIPE_PROF_INTR_BUSY_THRESHOLD) {
                        FIPE_KSTAT_DETAIL_INC(cpu_intr_busy_cnt);
                        sp->throttle_ts = ts + FIPE_PROF_INTR_BUSY_THROTTLE *
                            fipe_idle_ctrl.tick_interval;
                        sp->cond_ready = B_FALSE;
                } else if (intr >= FIPE_PROF_INTR_THRESHOLD) {
                        FIPE_KSTAT_DETAIL_INC(cpu_intr_throttle_cnt);
                        sp->cond_ready = B_FALSE;
                }
        } else if (++sp->idle_count >= FIPE_PROF_IDLE_COUNT) {
                /*
                 * Too many idle enter/exit in this tick.  Skip PM checks
                 * until one tick_interval past the next sampling deadline.
                 */
                FIPE_KSTAT_DETAIL_INC(cpu_loop_cnt);
                sp->throttle_ts = sp->next_ts + fipe_idle_ctrl.tick_interval;
                sp->idle_count = 0;
                sp->cond_ready = B_FALSE;
                return (B_FALSE);
        }

        return (sp->cond_ready);
}

/*
 * Idle-enter callback, run on the CPU that is about to go idle.
 *
 * arg        - unused.
 * ctx        - idle callback context used to read the cpu-idle properties.
 * check_func - optional wakeup check, handed on to fipe_enable().
 * check_arg  - argument for check_func.
 *
 * When PM may be considered for this CPU (policy is not DISABLE, IOAT is
 * ready, the per-CPU state is initialized and the CPU is not throttled), the
 * CPU's iowait delta is folded into fipe_gbl_ctrl.io_waiters and the CPU is
 * evaluated with fipe_check_cpu().  A qualifying CPU is counted in
 * fipe_gbl_ctrl.cpu_count; the CPU that brings the count up to ncpus tries
 * to enter power saving through fipe_enable(), unless io_waiters is nonzero.
 *
 * Otherwise the CPU is marked not ready, or, on the first call after
 * fipe_start(), its sampling baseline is initialized.
 */
/*ARGSUSED*/
static void
fipe_idle_enter(void *arg, cpu_idle_callback_context_t ctx,
    cpu_idle_check_wakeup_t check_func, void* check_arg)
{
        hrtime_t ts;
        uint32_t cnt;
        uint64_t iowait;
        cpu_t *cp = CPU;
        struct fipe_cpu_state *sp;

        sp = &fipe_cpu_states[cp->cpu_id];
        ts = cpu_idle_prop_get_hrtime(fipe_idle_ctrl.prop_enter, ctx);

        if (fipe_pm_policy != FIPE_PM_POLICY_DISABLE &&
            fipe_ioat_ctrl.ioat_ready &&
            sp->state_ready && sp->throttle_ts <= ts) {
                /* Adjust iowait count for local CPU. */
                iowait = CPU_STATS(cp, sys.iowait);
                if (iowait != sp->last_iowait) {
                        atomic_add_64(&fipe_gbl_ctrl.io_waiters,
                            iowait - sp->last_iowait);
                        sp->last_iowait = iowait;
                }

                /* Check current CPU status. */
                if (fipe_check_cpu(sp, ctx, ts)) {
                        /* Increase count of CPU ready for power saving. */
                        do {
                                cnt = fipe_gbl_ctrl.cpu_count;
                                ASSERT(cnt < ncpus);
                        } while (atomic_cas_32(&fipe_gbl_ctrl.cpu_count,
                            cnt, cnt + 1) != cnt);

                        /*
                         * Enable power saving if all CPUs are idle.
                         */
                        if (cnt + 1 == ncpus) {
                                if (fipe_gbl_ctrl.io_waiters == 0) {
                                        fipe_gbl_ctrl.enter_ts = ts;
                                        fipe_enable(fipe_pm_throttle_level,
                                            check_func, check_arg);
                                /* There are ongoing block io operations. */
                                } else {
                                        FIPE_KSTAT_DETAIL_INC(bio_busy_cnt);
                                }
                        }
                }
        } else if (fipe_pm_policy == FIPE_PM_POLICY_DISABLE ||
            fipe_ioat_ctrl.ioat_ready == B_FALSE) {
                /*
                 * PM is not possible right now.  Clear cond_ready so that
                 * fipe_idle_exit() does not decrement cpu_count for an
                 * idle period that never incremented it.
                 */
                if (sp->cond_ready == B_TRUE) {
                        sp->cond_ready = B_FALSE;
                }
        } else if (sp->state_ready == B_FALSE) {
                cpu_idle_prop_value_t val;

                /* First visit: establish the baseline for later samples. */
                sp->cond_ready = B_FALSE;
                sp->state_ready = B_TRUE;
                sp->throttle_ts = 0;
                sp->next_ts = ts + fipe_idle_ctrl.tick_interval;
                sp->last_busy = cpu_idle_prop_get_hrtime(
                    fipe_idle_ctrl.prop_busy, ctx);
                sp->last_idle = cpu_idle_prop_get_hrtime(
                    fipe_idle_ctrl.prop_idle, ctx);
                /*
                 * The interrupt count is a uint64 counter which
                 * fipe_check_cpu() samples with cpu_idle_prop_get_value().
                 * Read the baseline through the same accessor, rather than
                 * as an hrtime, so that the first delta computed there is
                 * taken between two values obtained the same way.
                 * Fall back to 0 if the property can't be read.
                 */
                if (cpu_idle_prop_get_value(fipe_idle_ctrl.prop_intr, ctx,
                    &val) == 0) {
                        sp->last_intr = val.cipv_uint64;
                } else {
                        sp->last_intr = 0;
                }
                sp->idle_count = 0;
        }
}

/*
 * Idle-exit callback, run on the CPU that is leaving idle state.
 *
 * If the CPU was counted as ready for power saving (sp->cond_ready), remove
 * it from fipe_gbl_ctrl.cpu_count.  The CPU that finds the count at ncpus
 * before its decrement also disables power saving and accounts the time
 * since fipe_gbl_ctrl.enter_ts in time_in_pm.  arg and flags are unused.
 */
/*ARGSUSED*/
static void
fipe_idle_exit(void* arg, cpu_idle_callback_context_t ctx, int flags)
{
        uint32_t prev;
        hrtime_t exit_ts;
        struct fipe_cpu_state *sp;

        sp = &fipe_cpu_states[CPU->cpu_id];
        if (!sp->cond_ready) {
                /* This CPU was never counted; nothing to undo. */
                return;
        }

        /* Atomically take this CPU out of the ready count. */
        for (;;) {
                prev = fipe_gbl_ctrl.cpu_count;
                ASSERT(prev > 0);
                if (atomic_cas_32(&fipe_gbl_ctrl.cpu_count,
                    prev, prev - 1) == prev) {
                        break;
                }
        }

        /*
         * Only the first CPU waking from idle state (the one that saw all
         * CPUs counted) tries to disable power saving state.  All other
         * CPUs just go on and do not wait for memory to recover from power
         * saving state, so there are possible periods during which some
         * CPUs are active while memory is still in power saving state.
         * This is OK: it is an uncommon case, and it is better for
         * performance to let them continue, as their blocking latency is
         * smaller than a mutex.
         */
        if (prev != ncpus) {
                return;
        }

        fipe_disable();

        /*
         * NOTE(review): the time is accumulated whenever this CPU saw
         * cpu_count == ncpus, even if fipe_idle_enter() did not refresh
         * enter_ts for this period (io_waiters != 0) or fipe_enable() did
         * not activate power saving -- confirm this is intended.
         */
        exit_ts = cpu_idle_prop_get_hrtime(fipe_idle_ctrl.prop_exit, ctx);
        fipe_gbl_ctrl.time_in_pm += exit_ts - fipe_gbl_ctrl.enter_ts;
}