root/usr/src/uts/common/io/pciex/pcie_pwr.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/kmem.h>
#include <sys/sysmacros.h>
#include <sys/sunddi.h>
#include <sys/sunpm.h>
#include <sys/epm.h>
#include <sys/sunndi.h>
#include <sys/ddi_impldefs.h>
#include <sys/ddi_implfuncs.h>
#include <sys/pcie.h>
#include <sys/pcie_impl.h>
#include <sys/promif.h>         /* prom_printf */
#include <sys/pcie_pwr.h>

/*
 * This file implements the power management functionality for
 * pci express switch and pci express-to-pci/pci-x bridge. All the
 * code in this file is generic and is not specific to a particular chip.
 * The algorithm, which decides when to go to a lower power is explained
 * below:
 *
 *      1. Initially when no children are attached, the driver is idle from
 *      PM framework point of view ( PM idle/PM busy).
 *
 *      2. Driver is PM busy if either a reference count called pwr_hold is
 *      greater than zero or driver is already at the lowest possible power
 *      level. The lowest possible power level for the driver is equal to the
 *      highest power level among its children. The PM busy condition is
 *      indicated by PCIE_PM_BUSY bit. At any point, only one pm_busy_component
 *      call is made for a nexus driver instance.
 *
 *      3. Driver is PM idle if the pwr_hold is zero and the lowest
 *      possible power level is less than the driver's current power level.
 *      At any point, only one pm_idle_component call is made for a nexus
 *      driver instance.
 *
 *      4. For any events like child attach, it increments pwr_hold and marks
 *      itself busy, if it is not already done so. This temporary hold is
 *      removed when the event is complete.
 *
 *      5. Any child's power change requires the parent (this driver) to be
 *      full power. So it raises its power and increments pwr_hold. It also
 *      marks itself temporarily busy, if it is not already done. This hold
 *      is removed when the child power change is complete.
 *
 *      6. After each child power change, it evaluates what is the lowest
 *      possible power level. If the lowest possible power level is less than
 *      the current power level and pwr_hold is zero, then it marks itself
 *      idle. The lowest power level is equal or greater than the highest level
 *      among the children. It keeps track of children's power level by
 *      using counters.
 *
 *      7. Any code e.g., which is accessing the driver's own registers should
 *      place a temporary hold using pcie_pm_hold.
 */

/*
 * Prototypes for the static helpers used in this file.
 */
static int pcie_pwr_change(dev_info_t *dip, pcie_pwr_t *pwr_p, int new);
static void pwr_update_counters(int *countersp, int olevel, int nlevel);
static int pwr_level_allowed(pcie_pwr_t *pwr_p);
static void pcie_add_comps(dev_info_t *dip, dev_info_t *cdip,
    pcie_pwr_t *pwr_p);
static void pcie_remove_comps(dev_info_t *dip, dev_info_t *cdip,
    pcie_pwr_t *pwr_p);
static void pcie_pm_subrelease(dev_info_t *dip, pcie_pwr_t *pwr_p);
static boolean_t pcie_is_pcie(dev_info_t *dip);
#ifdef DEBUG
/* Only declared for DEBUG builds; used as an argument to PCIE_DBG(). */
static char *pcie_decode_pwr_op(pm_bus_power_op_t op);
#else
/*
 * Outside DEBUG the bare name expands to nothing.
 * NOTE(review): this presumably relies on PCIE_DBG() discarding its
 * arguments in non-DEBUG builds -- confirm against the PCIE_DBG definition.
 */
#define pcie_decode_pwr_op
#endif

/*
 * power entry point.
 *
 * Decides whether a request to move this switch or bridge to "level" can
 * be honored and, if so, programs the new level through pcie_pwr_change().
 *
 * A request to go to a lower level is refused while the nexus is marked
 * PCIE_PM_BUSY, or when "level" is below what pwr_level_allowed() reports.
 *
 * Returns DDI_SUCCESS when the nexus is at "level" on return (including
 * the case where it was already there), DDI_FAILURE otherwise. On x86 the
 * routine returns DDI_SUCCESS immediately for any non-NULL dip.
 */
/* ARGSUSED */
int
pcie_power(dev_info_t *dip, int component, int level)
{
        pcie_pwr_t *pp = PCIE_NEXUS_PMINFO(dip);
        int *lvl_counts = pp->pwr_counters;
        int caps = pp->pwr_pmcaps;
        int rval = DDI_FAILURE;
        int lowest;

#if defined(__x86)
        if (dip)
                return (DDI_SUCCESS);
#endif /* defined(__x86) */

        ASSERT(level != PM_LEVEL_UNKNOWN);
        /* PM should not be asking for a level which is unsupported */
        ASSERT(level == PM_LEVEL_D0 || level == PM_LEVEL_D3 ||
            (level == PM_LEVEL_D1 && (caps & PCIE_SUPPORTS_D1)) ||
            (level == PM_LEVEL_D2 && (caps & PCIE_SUPPORTS_D2)));

        mutex_enter(&pp->pwr_lock);
        PCIE_DBG("%s(%d): pcie_power: change from %d to %d\n",
            ddi_driver_name(dip), ddi_get_instance(dip), pp->pwr_func_lvl,
            level);

        /* Already at the requested level: nothing to program. */
        if (level == pp->pwr_func_lvl) {
                PCIE_DBG("%s(%d): pcie_power: already at %d\n",
                    ddi_driver_name(dip), ddi_get_instance(dip), level);
                rval = DDI_SUCCESS;
                goto out;
        }

        if (pp->pwr_func_lvl > level) {
                /*
                 * Request to lower power. Refuse it if we are busy,
                 * which also covers an outstanding hold.
                 */
                if ((pp->pwr_flags & PCIE_PM_BUSY) != 0) {
                        PCIE_DBG("%s(%d): pcie_power: rejecting change to %d "
                            "as busy\n", ddi_driver_name(dip),
                            ddi_get_instance(dip), level);
                        goto out;
                }

                /*
                 * Not busy and no hold, so no child may be at full
                 * power (or at an unknown level) any more. Refuse the
                 * request if the requested level is lower than the
                 * lowest level the children allow.
                 */
                ASSERT(lvl_counts[PCIE_D0_INDEX] == 0 &&
                    lvl_counts[PCIE_UNKNOWN_INDEX] == 0);
                lowest = pwr_level_allowed(pp);
                if (level < lowest) {
                        PCIE_DBG("%s(%d): pcie_power: rejecting level %d as"
                            " %d is the lowest possible\n",
                            ddi_driver_name(dip), ddi_get_instance(dip), level,
                            lowest);
                        goto out;
                }
        }

        if (pcie_pwr_change(dip, pp, level) == DDI_SUCCESS) {
                /* Record the new level only once the hardware accepted it. */
                pp->pwr_func_lvl = level;
                PCIE_DBG("%s(%d): pcie_power: level changed to %d \n",
                    ddi_driver_name(dip), ddi_get_instance(dip), level);
                rval = DDI_SUCCESS;
        } else {
                PCIE_DBG("%s(%d): pcie_power: attempt to change to %d "
                    " failed \n", ddi_driver_name(dip), ddi_get_instance(dip),
                    level);
        }

out:
        mutex_exit(&pp->pwr_lock);
        return (rval);
}

/*
 * Program a new power state into the PMCSR register.
 * Called by pcie_power() only; the caller holds pwr_lock.
 *
 *      dip   - dev_info pointer
 *      pwr_p - pm info for the node
 *      new   - new level; must differ from the current pwr_func_lvl
 *
 * Config space is saved before entering D3 and restored after leaving
 * D3. Returns DDI_SUCCESS, or DDI_FAILURE if the save or restore of the
 * config registers fails. pwr_func_lvl itself is not updated here.
 */
static int
pcie_pwr_change(dev_info_t *dip, pcie_pwr_t *pwr_p, int new)
{
        uint16_t csr;
        uint16_t state;

        ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
        ASSERT(new != pwr_p->pwr_func_lvl);

        csr = pci_config_get16(pwr_p->pwr_conf_hdl, pwr_p->pwr_pmcsr_offset);

        /* Translate the PM level into the PMCSR power state encoding. */
        switch (new) {
        case PM_LEVEL_D0:
                state = PCI_PMCSR_D0;
                break;

        case PM_LEVEL_D1:
                state = PCI_PMCSR_D1;
                break;

        case PM_LEVEL_D2:
                state = PCI_PMCSR_D2;
                break;

        case PM_LEVEL_D3:
                state = PCI_PMCSR_D3HOT;
                break;

        default:
                /* Unexpected level: leave the state field cleared. */
                ASSERT(0);
                state = 0;
                break;
        }
        csr = (csr & ~PCI_PMCSR_STATE_MASK) | state;

        /* Config space has to be saved before we drop into D3. */
        if (new == PM_LEVEL_D3) {
                PCIE_DBG("%s(%d): pwr_change: saving config space regs\n",
                    ddi_driver_name(dip), ddi_get_instance(dip));
                if (pci_save_config_regs(dip) != DDI_SUCCESS) {
                        PCIE_DBG("%s(%d): pcie_pwr_change: failed to save "
                            "config space regs\n", ddi_driver_name(dip),
                            ddi_get_instance(dip));
                        return (DDI_FAILURE);
                }
        }

        pci_config_put16(pwr_p->pwr_conf_hdl, pwr_p->pwr_pmcsr_offset, csr);

        /*
         * TBD: Taken from pci_pci driver. Is this required?
         * No bus transactions should occur until the settle time
         * specified in PCI PM spec rev 2.1 sec 5.6.1 has elapsed.
         * For simplicity the maximum time is used for every
         * state transition.
         */
        delay(drv_usectohz(PCI_CLK_SETTLE_TIME));

        /* Leaving D3 (pwr_func_lvl is still the old level): restore. */
        if (pwr_p->pwr_func_lvl != PM_LEVEL_D3)
                return (DDI_SUCCESS);

        PCIE_DBG("%s(%d): pcie_pwr_change: restoring config space\n",
            ddi_driver_name(dip), ddi_get_instance(dip));
        if (pci_restore_config_regs(dip) != DDI_SUCCESS) {
                PCIE_DBG("%s(%d): pcie_pwr_change: failed to restore "
                    "config space regs\n", ddi_driver_name(dip),
                    ddi_get_instance(dip));
                return (DDI_FAILURE);
        }
        return (DDI_SUCCESS);
}

/*
 * bus_ctlops.bus_power function.
 *
 * Handles the PRE_ and POST_ notifications, and the HAS_CHANGED
 * notification, sent by the PM framework for a child's power level
 * change. It marks this nexus idle or busy based on the children's
 * power levels.
 *
 *      dip      - nexus dev_info pointer
 *      impl_arg - handed to pm_busop_bus_power() for ops not handled here
 *      op       - bus power operation
 *      arg      - pm_bp_child_pwrchg_t for PRE/POST notifications,
 *                 pm_bp_has_changed_t for HAS_CHANGED
 *      result   - for POST/HAS_CHANGED, read as an int holding the
 *                 outcome of the child's power change
 *
 * PRE_NOTIFICATION returns DDI_FAILURE when child PM is disabled
 * (PCIE_NO_CHILD_PM) and component 0 asks for a level below D0;
 * otherwise it returns the result of pcie_pm_hold(). POST_NOTIFICATION
 * and HAS_CHANGED return DDI_SUCCESS. Any other op returns whatever
 * pm_busop_bus_power() returns. On x86 the routine returns DDI_SUCCESS
 * immediately for any non-NULL dip.
 */
int
pcie_bus_power(dev_info_t *dip, void *impl_arg, pm_bus_power_op_t op,
    void *arg, void *result)
{
        pcie_pwr_t *pwr_p = PCIE_NEXUS_PMINFO(dip);
        int *counters; /* nexus counters */
        int *child_counters; /* per child dip counters */
        pm_bp_child_pwrchg_t *bpc;
        pm_bp_has_changed_t *bphc;
        /*
         * The next four are only filled in for the ops handled below;
         * give them defined values so no path sees indeterminate data.
         */
        dev_info_t *cdip = NULL;
        int new_level = PM_LEVEL_UNKNOWN;
        int old_level = PM_LEVEL_UNKNOWN;
        int comp = 0;
        int rv = DDI_SUCCESS;
        int level_allowed;

#if defined(__x86)
        if (dip)
                return (DDI_SUCCESS);
#endif /* defined(__x86) */

        /* Pull the child, component and levels out of the op argument. */
        switch (op) {
        case BUS_POWER_PRE_NOTIFICATION:
        case BUS_POWER_POST_NOTIFICATION:
                bpc = (pm_bp_child_pwrchg_t *)arg;
                cdip = bpc->bpc_dip;
                new_level = bpc->bpc_nlevel;
                old_level = bpc->bpc_olevel;
                comp = bpc->bpc_comp;
                break;

        case BUS_POWER_HAS_CHANGED:
                bphc = (pm_bp_has_changed_t *)arg;
                cdip = bphc->bphc_dip;
                new_level = bphc->bphc_nlevel;
                old_level = bphc->bphc_olevel;
                comp = bphc->bphc_comp;
                break;

        default:
                break;

        }

        /* Check pwr_p before anything is derived from it. */
        ASSERT(pwr_p);
        counters = pwr_p->pwr_counters;
        mutex_enter(&pwr_p->pwr_lock);
        switch (op) {
        case BUS_POWER_PRE_NOTIFICATION:
                PCIE_DBG("%s(%d): pcie_bus_power: %s@%d op %s %d->%d\n",
                    ddi_driver_name(dip), ddi_get_instance(dip),
                    ddi_driver_name(cdip), ddi_get_instance(cdip),
                    pcie_decode_pwr_op(op), old_level, new_level);
                /*
                 * If the nexus doesn't want the child to go into
                 * non-D0 state, mark the child busy. This way PM
                 * framework will never try to lower the child's power.
                 * In case of pm_lower_power, marking busy won't help.
                 * So we need to specifically reject the attempt to
                 * go to non-D0 state.
                 */
                if (pwr_p->pwr_flags & PCIE_NO_CHILD_PM) {
                        if (!PCIE_IS_COMPS_COUNTED(cdip)) {
                                PCIE_DBG("%s(%d): pcie_bus_power: marking "
                                    "child busy to disable pm \n",
                                    ddi_driver_name(dip),
                                    ddi_get_instance(dip));
                                (void) pm_busy_component(cdip, 0);
                        }
                        if (new_level < PM_LEVEL_D0 && !comp) {
                                PCIE_DBG("%s(%d): pcie_bus_power: rejecting "
                                    "child's attempt to go to %d\n",
                                    ddi_driver_name(dip), ddi_get_instance(dip),
                                    new_level);
                                rv = DDI_FAILURE;
                        }
                }
                /*
                 * Drop the lock before taking the hold: pcie_pm_hold()
                 * acquires pwr_lock itself.
                 */
                mutex_exit(&pwr_p->pwr_lock);
                if (rv == DDI_SUCCESS)
                        rv = pcie_pm_hold(dip);
                return (rv);

        case BUS_POWER_HAS_CHANGED:
        case BUS_POWER_POST_NOTIFICATION:
                PCIE_DBG("%s(%d): pcie_bus_power: %s@%d op %s %d->%d\n",
                    ddi_driver_name(dip), ddi_get_instance(dip),
                    ddi_driver_name(cdip), ddi_get_instance(cdip),
                    pcie_decode_pwr_op(op), old_level, new_level);
                /*
                 * Child device power changed
                 * If pm components of this child aren't accounted for
                 * then add the components to the counters. This can't
                 * be done in POST_ATTACH ctlop as pm info isn't created
                 * by then. Also because a driver can make a pm call during
                 * the attach.
                 */
                if (!PCIE_IS_COMPS_COUNTED(cdip)) {
                        (void) pcie_pm_add_child(dip, cdip);
                        if ((pwr_p->pwr_flags & PCIE_NO_CHILD_PM) &&
                            (op == BUS_POWER_HAS_CHANGED)) {
                                PCIE_DBG("%s(%d): pcie_bus_power: marking "
                                    "child busy to disable pm \n",
                                    ddi_driver_name(dip),
                                    ddi_get_instance(dip));
                                (void) pm_busy_component(cdip, 0);
                                /*
                                 * If the driver has already changed to lower
                                 * power(pm_power_has_changed) on its own,
                                 * there is nothing we can do other than
                                 * logging the warning message on the console.
                                 */
                                if (new_level < PM_LEVEL_D0)
                                        cmn_err(CE_WARN, "!Downstream device "
                                            "%s@%d went to non-D0 state: "
                                            "possible loss of link\n",
                                            ddi_driver_name(cdip),
                                            ddi_get_instance(cdip));
                        }
                }


                /*
                 * If it is POST and device PM is supported, release the
                 * hold done in PRE.
                 */
                if (op == BUS_POWER_POST_NOTIFICATION &&
                    PCIE_SUPPORTS_DEVICE_PM(dip)) {
                        pcie_pm_subrelease(dip, pwr_p);
                }

                /* A failed child change leaves the counters untouched. */
                if (*((int *)result) == DDI_FAILURE) {
                        PCIE_DBG("%s(%d): pcie_bus_power: change for %s%d "
                            "failed\n", ddi_driver_name(dip),
                            ddi_get_instance(dip), ddi_driver_name(cdip),
                            ddi_get_instance(cdip));
                        break;
                }
                /* Modify counters appropriately */
                pwr_update_counters(counters, old_level, new_level);

                child_counters = PCIE_CHILD_COUNTERS(cdip);
                pwr_update_counters(child_counters, old_level, new_level);

                /* If no device PM, return */
                if (!PCIE_SUPPORTS_DEVICE_PM(dip))
                        break;

                level_allowed = pwr_level_allowed(pwr_p);
                /*
                 * Check conditions for marking busy
                 * Check the flag to set this busy only once for multiple
                 * busy conditions. Mark busy if our current lowest possible
                 * is equal or greater to the current level.
                 */
                if (level_allowed >= pwr_p->pwr_func_lvl &&
                    !(pwr_p->pwr_flags & PCIE_PM_BUSY)) {
                        PCIE_DBG("%s(%d): pcie_bus_power: marking busy\n",
                            ddi_driver_name(dip), ddi_get_instance(dip));
                        (void) pm_busy_component(dip, 0);
                        pwr_p->pwr_flags |= PCIE_PM_BUSY;
                        break;
                }
                /*
                 * Check conditions for marking idle.
                 * If our lowest possible level is less than our current
                 * level mark idle. Mark idle only if it is not already done.
                 */
                if ((level_allowed < pwr_p->pwr_func_lvl) &&
                    (pwr_p->pwr_hold == 0) &&
                    (pwr_p->pwr_flags & PCIE_PM_BUSY)) {
                        /*
                         * For pci express, we should check here whether
                         * the link is in L1 state or not.
                         */
                        PCIE_DBG("%s(%d): pcie_bus_power: marking idle\n",
                            ddi_driver_name(dip), ddi_get_instance(dip));
                        (void) pm_idle_component(dip, 0);
                        pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
                        break;
                }
                break;

        default:
                /* Not ours: let the generic implementation handle it. */
                mutex_exit(&pwr_p->pwr_lock);
                return (pm_busop_bus_power(dip, impl_arg, op, arg, result));
        }
        mutex_exit(&pwr_p->pwr_lock);
        return (rv);
}

/*
 * Move one component from olevel to nlevel in a counter array: the
 * count at olevel drops by one and the count at nlevel grows by one.
 * A level of PM_LEVEL_UNKNOWN is accounted in the PCIE_UNKNOWN_INDEX
 * slot; any other level is used directly as the array index.
 */
static void
pwr_update_counters(int *countersp, int olevel, int nlevel)
{
        uint32_t        from_idx;
        uint32_t        to_idx;

        ASSERT(olevel >= PM_LEVEL_UNKNOWN && olevel <= PM_LEVEL_D0);
        ASSERT(nlevel >= PM_LEVEL_UNKNOWN && nlevel <= PM_LEVEL_D0);

        if (olevel == PM_LEVEL_UNKNOWN)
                from_idx = PCIE_UNKNOWN_INDEX;
        else
                from_idx = olevel;

        if (nlevel == PM_LEVEL_UNKNOWN)
                to_idx = PCIE_UNKNOWN_INDEX;
        else
                to_idx = nlevel;

        countersp[from_idx]--;
        countersp[to_idx]++;
}

/*
 * Compute the lowest power level the nexus may go to, given the levels
 * its children are at. That is the highest level any child is at,
 * rounded up to a level the nexus supports.
 *      UNKNOWN = D0 > D1 > D2 > D3
 *
 * Caller holds pwr_lock. Returns a PM_LEVEL_* value; PM_LEVEL_D3 when
 * no counter above the D3 slot is set.
 */
static int
pwr_level_allowed(pcie_pwr_t *pwr_p)
{
        int *lvl_counts = pwr_p->pwr_counters;
        int lvl;
        int up;

        ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
        /*
         * Walk down from UNKNOWN to D2 looking for the highest level
         * that has at least one child component; UNKNOWN counts as D0.
         * If that level is supported it is the answer, otherwise the
         * next higher supported level is. For example, when D1 is the
         * highest level among the children but D1 isn't supported, D0
         * is the lowest level possible. D3 needs no check: it is the
         * default lowest level and is always supported.
         */
        for (lvl = PCIE_UNKNOWN_INDEX; lvl > 0; lvl--) {
                if (lvl_counts[lvl] == 0)
                        continue;

                if (lvl == PCIE_UNKNOWN_INDEX)
                        return (PM_LEVEL_D0);

                /* lvl is the highest level among the children. */
                if (PCIE_LEVEL_SUPPORTED(pwr_p->pwr_pmcaps, lvl))
                        return (lvl);

                /* Not supported: look upwards for one that is. */
                up = lvl + 1;
                while (up <= PCIE_D0_INDEX) {
                        if (PCIE_LEVEL_SUPPORTED(pwr_p->pwr_pmcaps, up))
                                return (up);
                        up++;
                }
        }

        return (PM_LEVEL_D3);
}

/*
 * Account for a child's pm components in the nexus counters and create
 * the per-child counters. Every component is assumed to start at the
 * UNKNOWN level. A child with no pm components is ignored.
 *
 * Caller holds pwr_lock.
 */
static void
pcie_add_comps(dev_info_t *dip, dev_info_t *cdip, pcie_pwr_t *pwr_p)
{
        int ncomps = PM_NUMCMPTS(cdip);
        pcie_pm_t *child_pm;
        pcie_pwr_child_t *child_pwr;

        ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
        if (ncomps == 0)
                return;

        PCIE_DBG("%s(%d): pcie_add_comps: unknown level counter incremented "
            "from %d by %d because of %s@%d\n",
            ddi_driver_name(dip), ddi_get_instance(dip),
            (pwr_p->pwr_counters)[PCIE_UNKNOWN_INDEX], ncomps,
            ddi_driver_name(cdip), ddi_get_instance(cdip));
        (pwr_p->pwr_counters)[PCIE_UNKNOWN_INDEX] += ncomps;

        /*
         * The per-child counters hang off the child's pcie pm info.
         * A pci express nexus child (e.g. pcieb) may already have pm
         * info; for every leaf node it is allocated here.
         */
        child_pm = PCIE_PMINFO(cdip);
        if (child_pm == NULL) {
                child_pm = kmem_zalloc(sizeof (pcie_pm_t), KM_SLEEP);
                PCIE_SET_PMINFO(cdip, child_pm);
        }

        child_pwr = kmem_zalloc(sizeof (pcie_pwr_child_t), KM_SLEEP);
        (child_pwr->pwr_child_counters)[PCIE_UNKNOWN_INDEX] += ncomps;
        child_pm->pcie_par_pminfo = child_pwr;
}

/*
 * Take a child's pm components back out of the nexus counters and free
 * the child's pm bookkeeping. If no counters were ever created for the
 * child, only the leftover power hold is released.
 *
 * Caller holds pwr_lock.
 */
static void
pcie_remove_comps(dev_info_t *dip, dev_info_t *cdip, pcie_pwr_t *pwr_p)
{
        int *child_counts;
        int lvl;

        ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));

        if ((PCIE_PMINFO(cdip)) && (PCIE_PAR_PMINFO(cdip))) {
                PCIE_DBG("%s(%d): pcie_remove_comps:counters decremented "
                    "because of "
                    "%s@%d\n", ddi_driver_name(dip), ddi_get_instance(dip),
                    ddi_driver_name(cdip), ddi_get_instance(cdip));
                child_counts = PCIE_CHILD_COUNTERS(cdip);

                /*
                 * Only the nexus counters need adjusting; the per
                 * child counters are about to be freed.
                 */
                lvl = 0;
                while (lvl < PCIE_MAX_PWR_LEVELS) {
                        ASSERT((pwr_p->pwr_counters)[lvl] >=
                            child_counts[lvl]);
                        (pwr_p->pwr_counters)[lvl] -= child_counts[lvl];
                        lvl++;
                }

                /* Free the parent's pm info first, then the pm info. */
                kmem_free(PCIE_PAR_PMINFO(cdip), sizeof (pcie_pwr_child_t));
                kmem_free(PCIE_PMINFO(cdip), sizeof (pcie_pm_t));
                PCIE_RESET_PMINFO(cdip);
        } else if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
                /*
                 * The driver never made a PM call, so no counters
                 * exist for this device and the hold taken at
                 * PRE_ATTACH time is still in place; drop it now.
                 * The correct thing to do is to stay at full power
                 * while a child is at full power, driver or not.
                 * This will be implemented in the future.
                 */
                pcie_pm_subrelease(dip, pwr_p);
        }
}

/*
 * Power management initialization shared by px and pcieb.
 *
 * Allocates the nexus pcie_pwr_t (and the pcie_pm_t if the node has
 * none yet), initializes its lock and default levels, and runs the
 * platform specific setup. Returns DDI_SUCCESS with the pcie_pwr_t
 * attached to the node's pm info, or DDI_FAILURE with everything that
 * was allocated here freed again.
 */
int
pwr_common_setup(dev_info_t *dip)
{
        pcie_pm_t               *pm_p = PCIE_PMINFO(dip);
        pcie_pwr_t              *pwr_p;
        int                     allocated_pminfo = 0;

        /* The node may not have any pm info yet; create it if so. */
        if (pm_p == NULL) {
                pm_p = kmem_zalloc(sizeof (pcie_pm_t), KM_SLEEP);
                PCIE_SET_PMINFO(dip, pm_p);
                allocated_pminfo = 1;
        }

        pwr_p = kmem_zalloc(sizeof (pcie_pwr_t), KM_SLEEP);
        mutex_init(&pwr_p->pwr_lock, NULL, MUTEX_DRIVER, NULL);
        /* Start at an unknown level with the default level support. */
        pwr_p->pwr_func_lvl = PM_LEVEL_UNKNOWN;
        pwr_p->pwr_pmcaps = PCIE_DEFAULT_LEVEL_SUPPORTED;

        if (pcie_plat_pwr_setup(dip) == DDI_SUCCESS) {
                pm_p->pcie_pwr_p = pwr_p;
                return (DDI_SUCCESS);
        }

        /* Platform setup failed: unwind only what was created above. */
        mutex_destroy(&pwr_p->pwr_lock);
        kmem_free(pwr_p, sizeof (pcie_pwr_t));
        if (allocated_pminfo) {
                PCIE_RESET_PMINFO(dip);
                kmem_free(pm_p, sizeof (pcie_pm_t));
        }
        return (DDI_FAILURE);
}

/*
 * Undo whatever pwr_common_setup() did. Called by px_detach or
 * pxb_detach. Does nothing if the node has no pm info or no nexus
 * pcie_pwr_t.
 */
void
pwr_common_teardown(dev_info_t *dip)
{
        pcie_pm_t *pm_p = PCIE_PMINFO(dip);
        pcie_pwr_t *pwr_p;

        if (!pm_p)
                return;
        pwr_p = PCIE_NEXUS_PMINFO(dip);
        if (!pwr_p)
                return;

        pcie_plat_pwr_teardown(dip);
        mutex_destroy(&pwr_p->pwr_lock);
        pm_p->pcie_pwr_p = NULL;
        kmem_free(pwr_p, sizeof (pcie_pwr_t));

        /*
         * If the parent stored pm info about this node, leave the
         * pcie_pm_t in place here. If the parent stored none, it
         * doesn't need the pm info when it handles POST_DETACH for
         * this node, so free it now. For example, when dip is the
         * dip of the root complex there is no parent pm info.
         */
        if (PCIE_PAR_PMINFO(dip))
                return;

        kmem_free(pm_p, sizeof (pcie_pm_t));
        PCIE_RESET_PMINFO(dip);
}

/*
 * Place a power hold on the nexus: bump pwr_hold, mark the nexus busy
 * and bring it to full power if it is not there already.
 *
 * Returns DDI_SUCCESS when the hold is in place, and also when the node
 * has no PM info or no device PM (nothing is done then). Returns
 * DDI_FAILURE if raising the power fails; the hold is released again
 * in that case.
 */
int
pcie_pm_hold(dev_info_t *dip)
{
        pcie_pwr_t *pwr_p;
        int at_full_power;

        /* Nothing to hold without PM info or device PM support. */
        if (!PCIE_PMINFO(dip))
                return (DDI_SUCCESS);
        if (!(pwr_p = PCIE_NEXUS_PMINFO(dip)))
                return (DDI_SUCCESS);
        if (!(PCIE_SUPPORTS_DEVICE_PM(dip)))
                return (DDI_SUCCESS);

        mutex_enter(&pwr_p->pwr_lock);
        ASSERT(pwr_p->pwr_hold >= 0);
        PCIE_DBG("%s(%d): pm_hold: incrementing hold \n",
            ddi_driver_name(dip), ddi_get_instance(dip));
        pwr_p->pwr_hold++;

        /* Tell the PM framework we are busy, but only once. */
        if ((pwr_p->pwr_flags & PCIE_PM_BUSY) == 0) {
                PCIE_DBG("%s(%d): pm_hold: marking busy\n",
                    ddi_driver_name(dip), ddi_get_instance(dip));
                pwr_p->pwr_flags |= PCIE_PM_BUSY;
                (void) pm_busy_component(dip, 0);
        }
        at_full_power = (pwr_p->pwr_func_lvl == PM_LEVEL_D0);
        mutex_exit(&pwr_p->pwr_lock);

        if (at_full_power)
                return (DDI_SUCCESS);

        /*
         * Not at full power, so power up (with pwr_lock dropped).
         * Full power is needed so that the link can be at L0; the
         * same holds for a PCI/PCI-X bus.
         */
        if (pm_raise_power(dip, 0, PM_LEVEL_D0) == DDI_SUCCESS)
                return (DDI_SUCCESS);

        PCIE_DBG("%s(%d): pm_hold: attempt to raise power "
            "from %d to %d failed\n", ddi_driver_name(dip),
            ddi_get_instance(dip), pwr_p->pwr_func_lvl,
            PM_LEVEL_D0);
        pcie_pm_release(dip);
        return (DDI_FAILURE);
}

/*
 * Drop a hold taken with pcie_pm_hold(). Takes pwr_lock around the
 * actual work, which is done by pcie_pm_subrelease(). Does nothing when
 * the node has no PM info or no device PM.
 */
void
pcie_pm_release(dev_info_t *dip)
{
        pcie_pwr_t *pwr_p;

        if (!PCIE_PMINFO(dip))
                return;
        if (!(pwr_p = PCIE_NEXUS_PMINFO(dip)))
                return;
        if (!(PCIE_SUPPORTS_DEVICE_PM(dip)))
                return;

        mutex_enter(&pwr_p->pwr_lock);
        pcie_pm_subrelease(dip, pwr_p);
        mutex_exit(&pwr_p->pwr_lock);
}

/*
 * Drop one power hold; the caller holds pwr_lock.
 *
 * Once the last hold is gone and the children allow a level below the
 * current one, the nexus is marked idle and PCIE_PM_BUSY is cleared.
 */
static void
pcie_pm_subrelease(dev_info_t *dip, pcie_pwr_t *pwr_p)
{
        int lowest;

        ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
        ASSERT(pwr_p->pwr_hold > 0);
        PCIE_DBG("%s(%d): pm_subrelease: decrementing hold \n",
            ddi_driver_name(dip), ddi_get_instance(dip));
        pwr_p->pwr_hold--;
        ASSERT(pwr_p->pwr_hold >= 0);
        ASSERT(pwr_p->pwr_flags & PCIE_PM_BUSY);

        lowest = pwr_level_allowed(pwr_p);

        /* Stay busy while holds remain or no lower level is possible. */
        if (pwr_p->pwr_hold != 0 || lowest >= pwr_p->pwr_func_lvl)
                return;

        PCIE_DBG("%s(%d): pm_subrelease: marking idle \n",
            ddi_driver_name(dip), ddi_get_instance(dip));
        (void) pm_idle_component(dip, 0);
        pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
}

/*
 * Called when a child makes its first power management call.
 * Sets up the counters, with all components of the child assumed to be
 * at the unknown level, and drops the power hold taken for the child.
 *      dip    - nexus dip
 *      cdip   - child's dip
 *
 * The caller must hold the nexus pwr_lock. Always returns DDI_SUCCESS.
 */
int
pcie_pm_add_child(dev_info_t *dip, dev_info_t *cdip)
{
        pcie_pwr_t *pwr_p;

        /* Without PM info there is nothing to account for. */
        if (!PCIE_PMINFO(dip))
                return (DDI_SUCCESS);
        pwr_p = PCIE_NEXUS_PMINFO(dip);
        if (!pwr_p)
                return (DDI_SUCCESS);

        ASSERT(MUTEX_HELD(&pwr_p->pwr_lock));
        ASSERT(pwr_p->pwr_func_lvl == PM_LEVEL_D0);
        pcie_add_comps(dip, cdip, pwr_p);

        if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
                /*
                 * PM was told we are busy at PRE_ATTACH time for this
                 * child. Drop that hold but leave the busy bit set.
                 * A device that never changes power never gets here,
                 * so its hold stays and we remain at full power.
                 */
                ASSERT(pwr_p->pwr_hold > 0);
                PCIE_DBG("%s(%d): pm_add_child: decrementing hold \n",
                    ddi_driver_name(dip), ddi_get_instance(dip));
                pwr_p->pwr_hold--;
                /* The busy bit must have been set along with the hold. */
                ASSERT(pwr_p->pwr_flags & PCIE_PM_BUSY);
        }

        return (DDI_SUCCESS);
}

/*
 * Adjust the counters when a child detaches and mark the nexus idle if
 * the idle conditions are then met. Called at POST_DETACH time.
 * Always returns DDI_SUCCESS.
 */
int
pcie_pm_remove_child(dev_info_t *dip, dev_info_t *cdip)
{
        pcie_pwr_t *pwr_p;
        int *lvl_counts;
        int nchildren;

        /* Without PM info there is nothing to undo. */
        if (!PCIE_PMINFO(dip))
                return (DDI_SUCCESS);
        pwr_p = PCIE_NEXUS_PMINFO(dip);
        if (!pwr_p)
                return (DDI_SUCCESS);

        mutex_enter(&pwr_p->pwr_lock);
        pcie_remove_comps(dip, cdip, pwr_p);

        /* The idle/busy bookkeeping only applies with device PM. */
        if (PCIE_SUPPORTS_DEVICE_PM(dip)) {
                lvl_counts = pwr_p->pwr_counters;
                nchildren = lvl_counts[PCIE_UNKNOWN_INDEX] +
                    lvl_counts[PCIE_D0_INDEX] + lvl_counts[PCIE_D1_INDEX] +
                    lvl_counts[PCIE_D2_INDEX] + lvl_counts[PCIE_D3_INDEX];

                /*
                 * With no hold outstanding, go idle when either no
                 * child components remain or the lowest level the
                 * children allow is below our current level. Only do
                 * so if we are currently marked busy.
                 */
                if (pwr_p->pwr_hold == 0 &&
                    (pwr_p->pwr_flags & PCIE_PM_BUSY) &&
                    (nchildren == 0 ||
                    pwr_level_allowed(pwr_p) < pwr_p->pwr_func_lvl)) {
                        PCIE_DBG("%s(%d): pcie_bus_power: marking idle\n",
                            ddi_driver_name(dip), ddi_get_instance(dip));
                        (void) pm_idle_component(dip, 0);
                        pwr_p->pwr_flags &= ~PCIE_PM_BUSY;
                }
        }

        mutex_exit(&pwr_p->pwr_lock);
        return (DDI_SUCCESS);
}

/*
 * Report whether the bus soft state of dip has a non-zero bus_pcie_off
 * (presumably the offset of the PCI Express capability, i.e. the device
 * is PCI Express -- confirm against the pcie_bus_t definition).
 *
 * Declared static to match the forward declaration at the top of the
 * file; linkage was already internal because of that declaration.
 */
static boolean_t
pcie_is_pcie(dev_info_t *dip)
{
        pcie_bus_t *bus_p = PCIE_DIP2BUS(dip);

        ASSERT(bus_p);
        return (bus_p->bus_pcie_off != 0);
}

/*
 * DDI_RESUME handler, called by px_attach or pcieb_attach.
 *
 * Tells the PM framework the nexus is at full power (when device PM is
 * supported), then restores the config registers of every initialized
 * child whose registers were saved by the nexus at suspend time.
 * Returns DDI_SUCCESS in all cases.
 */
int
pcie_pwr_resume(dev_info_t *dip)
{
        dev_info_t *child;
        pcie_pwr_t *pwr_p = NULL;

#if defined(__x86)
        if (dip)
                return (DDI_SUCCESS);
#endif /* defined(__x86) */

        if (PCIE_PMINFO(dip))
                pwr_p = PCIE_NEXUS_PMINFO(dip);

        /* Inform the PM framework that dip is at full power */
        if (pwr_p && PCIE_SUPPORTS_DEVICE_PM(dip)) {
                ASSERT(pwr_p->pwr_func_lvl == PM_LEVEL_D0);
                (void) pm_raise_power(dip, 0, pwr_p->pwr_func_lvl);
        }

        /*
         * Code taken from pci driver.
         * Walk the children and restore config registers for those that
         * did not save their own. Children power states are UNKNOWN
         * after a resume, because the PM framework may call resume
         * without an actual power cycle (e.g. if suspend fails).
         */
        for (child = ddi_get_child(dip); child != NULL;
            child = ddi_get_next_sibling(child)) {
                /*
                 * Children that are not yet init'ed are skipped; they
                 * will be set up by init_child().
                 */
                if (i_ddi_node_state(child) < DS_INITIALIZED) {
                        PCIE_DBG("%s(%d): "
                            "DDI_RESUME: skipping %s%d not in CF1\n",
                            ddi_driver_name(dip), ddi_get_instance(dip),
                            ddi_driver_name(child), ddi_get_instance(child));
                        continue;
                }

                /* Restore only what the nexus itself saved */
                if (ddi_prop_exists(DDI_DEV_T_ANY, child, DDI_PROP_DONTPASS,
                    "nexus-saved-config-regs") != 1)
                        continue;

                PCIE_DBG("%s(%d): "
                    "DDI_RESUME: nexus restoring %s%d config regs\n",
                    ddi_driver_name(dip), ddi_get_instance(dip),
                    ddi_driver_name(child), ddi_get_instance(child));

                /* clear errors left by OBP scrubbing */
                pcie_clear_errors(child);

                /*
                 * PCIe workaround: errors are disabled for the duration
                 * of the 4K config restore and re-enabled afterwards.
                 */
                if (pcie_is_pcie(child)) {
                        pcie_disable_errors(child);
                        (void) pci_restore_config_regs(child);
                        pcie_enable_errors(child);
                        (void) pcie_enable_ce(child);
                } else {
                        (void) pci_restore_config_regs(child);
                }

                /* The marker property has served its purpose */
                if (ndi_prop_remove(DDI_DEV_T_NONE, child,
                    "nexus-saved-config-regs") != DDI_PROP_SUCCESS) {
                        PCIE_DBG("%s(%d): %s%d can't remove prop %s",
                            ddi_driver_name(dip), ddi_get_instance(dip),
                            ddi_driver_name(child), ddi_get_instance(child),
                            "nexus-saved-config-regs");
                }
        }

        return (DDI_SUCCESS);
}

/*
 * Called by pcie_detach or pcieb_detach:: DDI_SUSPEND
 *
 * Raises the nexus to D0 if device PM is supported and it is not already
 * there, folds all nexus and per-child power level counters into the
 * "unknown" slot, and saves the config registers of every initialized
 * child that has not saved its own.
 *
 * Returns DDI_FAILURE if the nexus could not be raised to full power,
 * DDI_SUCCESS otherwise.
 */
int
pcie_pwr_suspend(dev_info_t *dip)
{
        dev_info_t *cdip;
        int i, *counters; /* per nexus counters */
        pcie_pwr_t *pwr_p = NULL;

#if defined(__x86)
        if (dip)
                return (DDI_SUCCESS);
#endif /* defined(__x86) */

        if (PCIE_PMINFO(dip))
                pwr_p = PCIE_NEXUS_PMINFO(dip);

        /*
         * Mark all children to be unknown and bring our power level
         * to full, if required. This is to avoid any panics while
         * accessing the child's config space.
         */
        if (pwr_p) {
                mutex_enter(&pwr_p->pwr_lock);
                if (PCIE_SUPPORTS_DEVICE_PM(dip) &&
                    pwr_p->pwr_func_lvl != PM_LEVEL_D0) {
                        /* pwr_lock is dropped around pm_raise_power() */
                        mutex_exit(&pwr_p->pwr_lock);
                        if (pm_raise_power(dip, 0, PM_LEVEL_D0) !=
                            DDI_SUCCESS) {
                                PCIE_DBG("%s(%d): pwr_suspend: attempt "
                                    "to raise power from %d to %d "
                                    "failed\n", ddi_driver_name(dip),
                                    ddi_get_instance(dip), pwr_p->pwr_func_lvl,
                                    PM_LEVEL_D0);
                                return (DDI_FAILURE);
                        }
                        mutex_enter(&pwr_p->pwr_lock);
                }
                counters = pwr_p->pwr_counters;
                /*
                 * Update the nexus counters. At the resume time all
                 * components are considered to be at unknown level. Use the
                 * fact that counters for unknown level are at the end.
                 */
                for (i = 0; i < PCIE_UNKNOWN_INDEX; i++) {
                        counters[PCIE_UNKNOWN_INDEX] += counters[i];
                        counters[i] = 0;
                }
                mutex_exit(&pwr_p->pwr_lock);
        }

        /*
         * Code taken from pci driver.
         * Save the state of the configuration headers of child
         * nodes.
         */
        for (cdip = ddi_get_child(dip); cdip != NULL;
            cdip = ddi_get_next_sibling(cdip)) {
                /*
                 * Per child dip counters. Scoped to a single iteration
                 * so that a child without counters never picks up the
                 * pointer left behind by a previously visited sibling.
                 */
                int             *child_counters = NULL;
                boolean_t       is_pcie;

                /*
                 * Not interested in children who are not already
                 * init'ed.  They will be set up in init_child().
                 */
                if (i_ddi_node_state(cdip) < DS_INITIALIZED) {
                        PCIE_DBG("%s(%d): DDI_SUSPEND: skipping "
                            "%s%d not in CF1\n", ddi_driver_name(dip),
                            ddi_get_instance(dip), ddi_driver_name(cdip),
                            ddi_get_instance(cdip));
                        continue;
                }
                /*
                 * Update per child dip counters, if any. Counters
                 * will not exist if the child is not power manageable
                 * or if its power entry is never invoked.
                 */
                if (PCIE_PMINFO(cdip) && PCIE_PAR_PMINFO(cdip))
                        child_counters = PCIE_CHILD_COUNTERS(cdip);
                if (child_counters && pwr_p) {
                        mutex_enter(&pwr_p->pwr_lock);
                        for (i = 0; i < PCIE_UNKNOWN_INDEX; i++) {
                                child_counters[PCIE_UNKNOWN_INDEX] +=
                                    child_counters[i];
                                child_counters[i] = 0;
                        }
                        mutex_exit(&pwr_p->pwr_lock);
                }

                /*
                 * Only save config registers if not already saved by child.
                 */
                if (ddi_prop_exists(DDI_DEV_T_ANY, cdip, DDI_PROP_DONTPASS,
                    SAVED_CONFIG_REGS) == 1) {
                        continue;
                }

                /*
                 * The nexus needs to save config registers.  Create a property
                 * so it knows to restore on resume. A failure to create the
                 * property is only logged; the registers are saved anyway.
                 */
                if (ndi_prop_create_boolean(DDI_DEV_T_NONE, cdip,
                    "nexus-saved-config-regs") != DDI_PROP_SUCCESS) {
                        PCIE_DBG("%s(%d): %s%d can't update prop %s",
                            ddi_driver_name(dip), ddi_get_instance(dip),
                            ddi_driver_name(cdip), ddi_get_instance(cdip),
                            "nexus-saved-config-regs");
                }
                PCIE_DBG("%s(%d): DDI_SUSPEND: saving config space for"
                    " %s%d\n", ddi_driver_name(dip), ddi_get_instance(dip),
                    ddi_driver_name(cdip), ddi_get_instance(cdip));

                /* PCIe workaround: disable errors during 4K config save */
                is_pcie = pcie_is_pcie(cdip);
                if (is_pcie)
                        pcie_disable_errors(cdip);
                (void) pci_save_config_regs(cdip);
                if (is_pcie) {
                        pcie_enable_errors(cdip);
                        (void) pcie_enable_ce(cdip);
                }
        }
        return (DDI_SUCCESS);
}

#ifdef DEBUG
/*
 * Pairs a bus_power_op value with a printable name.
 */
typedef struct pcie_buspwr_desc {
        pm_bus_power_op_t pwr_op;
        char *pwr_desc;
} pcie_buspwr_desc_t;

/*
 * Name table for the known bus_power_ops. The entry with a NULL
 * pwr_desc terminates the table.
 */
static pcie_buspwr_desc_t pcie_buspwr_desc[] = {
        {BUS_POWER_CHILD_PWRCHG, "CHILD_PWRCHG"},
        {BUS_POWER_NEXUS_PWRUP, "NEXUS_PWRUP"},
        {BUS_POWER_PRE_NOTIFICATION, "PRE_NOTIFICATION"},
        {BUS_POWER_POST_NOTIFICATION, "POST_NOTIFICATION"},
        {BUS_POWER_HAS_CHANGED, "HAS_CHANGED"},
        {BUS_POWER_NOINVOL, "NOINVOL"},
        {-1, NULL}
};

/*
 * Look up the printable name of bus_power_op op.
 * Returns "UNKNOWN OP" when op is not in the table.
 */
static char *
pcie_decode_pwr_op(pm_bus_power_op_t op)
{
        int idx;

        for (idx = 0; pcie_buspwr_desc[idx].pwr_desc != NULL; idx++) {
                if (pcie_buspwr_desc[idx].pwr_op == op)
                        return (pcie_buspwr_desc[idx].pwr_desc);
        }

        return ("UNKNOWN OP");
}
#endif