root/usr/src/uts/common/disp/cmt.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/processor.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/pghw.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>
#include <sys/cpu_pm.h>

/*
 * CMT scheduler / dispatcher support
 *
 * This file implements CMT scheduler support using Processor Groups.
 * The CMT processor group class creates and maintains the CMT class
 * specific processor group pg_cmt_t.
 *
 * ---------------------------- <-- pg_cmt_t *
 * | pghw_t                   |
 * ----------------------------
 * | CMT class specific data  |
 * | - hierarchy linkage      |
 * | - CMT load balancing data|
 * | - active CPU group/bitset|
 * ----------------------------
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between cpus to implement
 * optimized affinity, load balancing, and coalescence policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Affinity
 * policies seek to improve cache and TLB utilization. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * The CMT PGs created by this class are already arranged into a
 * hierarchy (which is done in the pghw layer). To implement the top-down
 * CMT load balancing algorithm, the CMT PGs additionally maintain
 * parent, child and sibling hierarchy relationships.
 * Parent PGs always contain a superset of their children's resources,
 * each PG can have at most one parent, and siblings are the group of PGs
 * sharing the same parent.
 *
 * On UMA based systems, the CMT load balancing algorithm begins by balancing
 * load across the group of top level PGs in the system hierarchy.
 * On NUMA systems, the CMT load balancing algorithm balances load across the
 * group of top level PGs in each leaf lgroup...but for root homed threads,
 * is willing to balance against all the top level PGs in the system.
 *
 * Groups of top level PGs are maintained to implement the above, one for each
 * leaf lgroup (containing the top level PGs in that lgroup), and one (for the
 * root lgroup) that contains all the top level PGs in the system.
 */
/*
 * File-scope CMT lgroup bookkeeping.
 *
 * cmt_root is created lazily by pg_cmt_cpu_init() the first time it runs.
 * cpu0_lgrp and is_cpu0 are updated at the end of the first successful
 * pg_cmt_cpu_init() call; cpu0_lgrp is later consulted by pg_cmt_cpu_fini().
 */
static cmt_lgrp_t       *cmt_lgrps = NULL;      /* cmt_lgrps list head */
static cmt_lgrp_t       *cpu0_lgrp = NULL;      /* boot CPU's initial lgrp */
                                                /* used for null_proc_lpa */
cmt_lgrp_t              *cmt_root = NULL;       /* cmt_lgrp_t for root lgroup */

static int              is_cpu0 = 1; /* true if this is boot CPU context */

/*
 * Array of hardware sharing relationships that are blacklisted, indexed by
 * pghw_type_t. An entry equal to 1 marks the relationship as blacklisted.
 * PGs are still created for blacklisted sharing relationships, but
 * pg_cmt_cpu_init() gives them CMT_NO_POLICY so that no CMT scheduling
 * optimizations are performed against them.
 */
static int              cmt_hw_blacklisted[PGHW_NUM_COMPONENTS];

/*
 * Set this to non-zero to disable CMT scheduling.
 * This must be done via kmdb -d, as /etc/system will be too late.
 * When set, the class is never registered and the class callbacks in this
 * file return without doing any work.
 */
int                     cmt_sched_disabled = 0;

/*
 * Status codes for CMT lineage validation.
 * See pg_cmt_lineage_validate() below.
 *
 * pg_cmt_cpu_init() only continues building a CPU's lineage when the
 * validator returns CMT_LINEAGE_VALID or CMT_LINEAGE_REPAIRED; any other
 * status makes it return early.
 */
typedef enum cmt_lineage_validation {
        CMT_LINEAGE_VALID,
        CMT_LINEAGE_NON_CONCENTRIC,
        CMT_LINEAGE_PG_SPANS_LGRPS,
        CMT_LINEAGE_NON_PROMOTABLE,
        CMT_LINEAGE_REPAIRED,
        CMT_LINEAGE_UNRECOVERABLE
} cmt_lineage_validation_t;

/*
 * Status of the current lineage under construction.
 * One must be holding cpu_lock to change this.
 * Starts out as CMT_LINEAGE_VALID.
 */
cmt_lineage_validation_t        cmt_lineage_status = CMT_LINEAGE_VALID;

/*
 * Power domain definitions (on x86) are defined by ACPI, and
 * therefore may be subject to BIOS bugs. A hardware sharing relationship
 * is considered "suspect" exactly when it is a power management domain.
 */
#define PG_CMT_HW_SUSPECT(hw)   PGHW_IS_PM_DOMAIN(hw)

/*
 * Macro to test if PG is managed by the CMT PG class.
 * Compares the PG's class id against the id recorded in pg_cmt_class_id.
 * The argument may be any pointer to a structure that begins with a pg_t.
 */
#define IS_CMT_PG(pg)   (((pg_t *)(pg))->pg_class->pgc_id == pg_cmt_class_id)

/* PG class id; assigned by pg_cmt_class_init() */
static pg_cid_t         pg_cmt_class_id;                /* PG class id */

/*
 * Forward declarations for the file-local CMT class routines.
 *
 * pg_cmt_alloc is declared with an explicit (void) parameter list so that
 * the declaration is a real prototype and agrees with the definition.
 */
static pg_t             *pg_cmt_alloc(void);
static void             pg_cmt_free(pg_t *);
static void             pg_cmt_cpu_init(cpu_t *, cpu_pg_t *);
static void             pg_cmt_cpu_fini(cpu_t *, cpu_pg_t *);
static void             pg_cmt_cpu_active(cpu_t *);
static void             pg_cmt_cpu_inactive(cpu_t *);
static void             pg_cmt_cpupart_in(cpu_t *, cpupart_t *);
static void             pg_cmt_cpupart_move(cpu_t *, cpupart_t *, cpupart_t *);
static char             *pg_cmt_policy_name(pg_t *);
static void             pg_cmt_hier_sort(pg_cmt_t **, int);
static pg_cmt_t         *pg_cmt_hier_rank(pg_cmt_t *, pg_cmt_t *);
static int              pg_cmt_cpu_belongs(pg_t *, cpu_t *);
static int              pg_cmt_hw(pghw_type_t);
static cmt_lgrp_t       *pg_cmt_find_lgrp(lgrp_handle_t);
static cmt_lgrp_t       *pg_cmt_lgrp_create(lgrp_handle_t);
static void             cmt_ev_thread_swtch(pg_t *, cpu_t *, hrtime_t,
                            kthread_t *, kthread_t *);
static void             cmt_ev_thread_swtch_pwr(pg_t *, cpu_t *, hrtime_t,
                            kthread_t *, kthread_t *);
static void             cmt_ev_thread_remain_pwr(pg_t *, cpu_t *, kthread_t *);
static cmt_lineage_validation_t pg_cmt_lineage_validate(pg_cmt_t **, int *,
                            cpu_pg_t *);

/*
 * CMT PG ops
 *
 * The initializer is positional, so the entries must stay in the order in
 * which struct pg_ops declares its members. The CMT class supplies no
 * cpupart_out handler (that slot is NULL).
 */
struct pg_ops pg_ops_cmt = {
        pg_cmt_alloc,
        pg_cmt_free,
        pg_cmt_cpu_init,
        pg_cmt_cpu_fini,
        pg_cmt_cpu_active,
        pg_cmt_cpu_inactive,
        pg_cmt_cpupart_in,
        NULL,                   /* cpupart_out */
        pg_cmt_cpupart_move,
        pg_cmt_cpu_belongs,
        pg_cmt_policy_name,
};

/*
 * Register the CMT PG class with the PG framework and remember the class
 * id it hands back.
 *
 * Nothing is registered (and pg_cmt_class_id is left untouched) when CMT
 * scheduling has been disabled via cmt_sched_disabled.
 */
void
pg_cmt_class_init(void)
{
        if (cmt_sched_disabled == 0) {
                pg_cmt_class_id = pg_class_register("cmt", &pg_ops_cmt,
                    PGR_PHYSICAL);
        }
}

/*
 * Called to indicate that a new CPU has started up, so that either t0 or
 * the slave startup thread can be accounted for.
 *
 * This is reported to the PG framework as a thread switch on "cp" from the
 * CPU's idle thread to the thread it is currently running, stamped with
 * the current unscaled high resolution time.
 */
void
pg_cmt_cpu_startup(cpu_t *cp)
{
        hrtime_t        now;

        now = gethrtime_unscaled();
        pg_ev_thread_swtch(cp, now, cp->cpu_idle_thread, cp->cpu_thread);
}

/*
 * Return non-zero if a thread can migrate between "from" and "to"
 * without a performance penalty.
 *
 * Two CPUs qualify when the cache ids cached in their cpu_physid
 * structures are equal; the result is 1 in that case and 0 otherwise.
 */
int
pg_cmt_can_migrate(cpu_t *from, cpu_t *to)
{
        return (from->cpu_physid->cpu_cacheid ==
            to->cpu_physid->cpu_cacheid);
}

/*
 * CMT class specific PG allocation
 */
static pg_t *
pg_cmt_alloc(void)
{
        return (kmem_zalloc(sizeof (pg_cmt_t), KM_NOSLEEP));
}

/*
 * Class specific PG de-allocation.
 *
 * "pg" must be a non-NULL PG belonging to the CMT class; its pg_cmt_t
 * sized storage is returned to kmem.
 */
static void
pg_cmt_free(pg_t *pg)
{
        pg_cmt_t        *cmt_pg = (pg_cmt_t *)pg;

        ASSERT(pg != NULL);
        ASSERT(IS_CMT_PG(pg));

        kmem_free(cmt_pg, sizeof (*cmt_pg));
}

/*
 * Given a hardware sharing relationship, return which dispatcher
 * policies should be implemented to optimize performance and efficiency.
 *
 * The platform is consulted first; any answer other than CMT_NO_POLICY
 * is final. Otherwise the built-in defaults apply:
 *      - shared cache:         affinity and load balancing
 *      - ipipe, FPU, procnode, chip, active/idle power domain:
 *                              load balancing
 *      - anything else:        no policy
 */
static pg_cmt_policy_t
pg_cmt_policy(pghw_type_t hw)
{
        pg_cmt_policy_t plat_policy;

        plat_policy = pg_plat_cmt_policy(hw);
        if (plat_policy != CMT_NO_POLICY)
                return (plat_policy);

        switch (hw) {
        case PGHW_CACHE:
                return (CMT_AFFINITY | CMT_BALANCE);
        case PGHW_IPIPE:
        case PGHW_FPU:
        case PGHW_PROCNODE:
        case PGHW_CHIP:
        case PGHW_POW_ACTIVE:
        case PGHW_POW_IDLE:
                return (CMT_BALANCE);
        default:
                break;
        }

        return (CMT_NO_POLICY);
}

/*
 * Rank the importance of optimizing for the pg1 relationship vs.
 * the pg2 relationship, returning whichever of the two PGs wins.
 */
static pg_cmt_t *
pg_cmt_hier_rank(pg_cmt_t *pg1, pg_cmt_t *pg2)
{
        pghw_type_t first_hw = ((pghw_t *)pg1)->pghw_hw;
        pghw_type_t second_hw = ((pghw_t *)pg2)->pghw_hw;

        /*
         * With CPU power management disabled, a power domain always
         * loses to a relationship that is not a power domain.
         */
        if (cpupm_get_policy() == CPUPM_POLICY_DISABLED) {
                if (PGHW_IS_PM_DOMAIN(first_hw) &&
                    !PGHW_IS_PM_DOMAIN(second_hw)) {
                        return (pg2);
                }
                if (PGHW_IS_PM_DOMAIN(second_hw) &&
                    !PGHW_IS_PM_DOMAIN(first_hw)) {
                        return (pg1);
                }
        }

        /*
         * No decision yet, so defer to the platform's ranking.
         */
        return ((pg_plat_hw_rank(first_hw, second_hw) == first_hw) ?
            pg1 : pg2);
}

/*
 * Initialize CMT callbacks for the given PG.
 *
 * PGs with no CMT policy keep whatever callbacks they already have.
 * Active power domain PGs get the power aware thread switch and thread
 * remain handlers; every other PG gets the generic thread switch handler.
 */
static void
cmt_callback_init(pg_t *pg)
{
        pg_cmt_t        *cmt_pg = (pg_cmt_t *)pg;
        pghw_type_t     hw;

        /*
         * No thread placement optimizations will be implemented, so the
         * default callbacks are left alone.
         */
        if (cmt_pg->cmt_policy == CMT_NO_POLICY)
                return;

        hw = ((pghw_t *)pg)->pghw_hw;
        if (hw == PGHW_POW_ACTIVE) {
                pg->pg_cb.thread_swtch = cmt_ev_thread_swtch_pwr;
                pg->pg_cb.thread_remain = cmt_ev_thread_remain_pwr;
        } else {
                pg->pg_cb.thread_swtch = cmt_ev_thread_swtch;
        }
}

/*
 * Promote PG above its current parent, i.e. swap the positions of "pg" and
 * its parent in the CMT hierarchy.
 * This is only legal if PG has an equal or greater number of CPUs than its
 * parent.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being promoted), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * Arguments:
 *      pg      - the PG to promote; nothing is done if it has no parent
 *      pgdata  - PG data to use in place of cpu->cpu_pg for a CPU that is
 *                still bootstrapped
 *
 * The caller must hold cpu_lock. All CPUs are paused for the duration of
 * the hierarchy update and restarted before returning.
 */
static void
cmt_hier_promote(pg_cmt_t *pg, cpu_pg_t *pgdata)
{
        pg_cmt_t        *parent;
        group_t         *children;
        cpu_t           *cpu;
        group_iter_t    iter;
        pg_cpu_itr_t    cpu_iter;
        int             r;
        int             err;
        int             nchildren;

        ASSERT(MUTEX_HELD(&cpu_lock));

        parent = pg->cmt_parent;
        if (parent == NULL) {
                /*
                 * Nothing to do
                 */
                return;
        }

        ASSERT(PG_NUM_CPUS((pg_t *)pg) >= PG_NUM_CPUS((pg_t *)parent));

        /*
         * We're changing around the hierarchy, which is actively traversed
         * by the dispatcher. Pause CPUS to ensure exclusivity.
         */
        pause_cpus(NULL, NULL);

        /*
         * If necessary, update the parent's sibling set, replacing parent
         * with PG.
         */
        if (parent->cmt_siblings) {
                if (group_remove(parent->cmt_siblings, parent, GRP_NORESIZE)
                    != -1) {
                        r = group_add(parent->cmt_siblings, pg, GRP_NORESIZE);
                        ASSERT(r != -1);
                }
        }

        /*
         * If the parent is at the top of the hierarchy, replace its entry
         * in the root lgroup's group of top level PGs. (When the parent's
         * sibling set already is the root's group, the replacement above
         * has taken care of it.)
         */
        if (parent->cmt_parent == NULL &&
            parent->cmt_siblings != &cmt_root->cl_pgs) {
                if (group_remove(&cmt_root->cl_pgs, parent, GRP_NORESIZE)
                    != -1) {
                        r = group_add(&cmt_root->cl_pgs, pg, GRP_NORESIZE);
                        ASSERT(r != -1);
                }
        }

        /*
         * We assume (and therefore assert) that the PG being promoted is an
         * only child of its parent. Update the parent's children set
         * replacing PG's entry with the parent (since the parent is becoming
         * the child). Then have PG and the parent swap children sets and
         * children counts.
         */
        ASSERT(GROUP_SIZE(parent->cmt_children) <= 1);
        if (group_remove(parent->cmt_children, pg, GRP_NORESIZE) != -1) {
                r = group_add(parent->cmt_children, parent, GRP_NORESIZE);
                ASSERT(r != -1);
        }

        children = pg->cmt_children;
        pg->cmt_children = parent->cmt_children;
        parent->cmt_children = children;

        nchildren = pg->cmt_nchildren;
        pg->cmt_nchildren = parent->cmt_nchildren;
        parent->cmt_nchildren = nchildren;

        /*
         * Update the sibling references for PG and its parent.
         * PG inherits the parent's old sibling set. After the swap above,
         * pg->cmt_children is the set that now holds the parent, so it
         * becomes the parent's sibling set.
         */
        pg->cmt_siblings = parent->cmt_siblings;
        parent->cmt_siblings = pg->cmt_children;

        /*
         * Update any cached lineages in the per CPU pg data.
         */
        PG_CPU_ITR_INIT(pg, cpu_iter);
        while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
                int             idx;
                int             sz;
                pg_cmt_t        *cpu_pg;
                cpu_pg_t        *pgd;   /* CPU's PG data */

                /*
                 * The CPU whose lineage is under construction still
                 * references the bootstrap CPU PG data structure, so
                 * use the caller supplied PG data for it instead.
                 */
                if (pg_cpu_is_bootstrapped(cpu))
                        pgd = pgdata;
                else
                        pgd = cpu->cpu_pg;

                /*
                 * Iterate over the CPU's PGs updating the children
                 * of the PG being promoted, since they have a new parent.
                 */
                group_iter_init(&iter);
                while ((cpu_pg = group_iterate(&pgd->cmt_pgs, &iter)) != NULL) {
                        if (cpu_pg->cmt_parent == pg) {
                                cpu_pg->cmt_parent = parent;
                        }
                }

                /*
                 * Update the CMT load balancing lineage
                 */
                if ((idx = group_find(&pgd->cmt_pgs, (void *)pg)) == -1) {
                        /*
                         * Unless this is the CPU whose lineage is being
                         * constructed, the PG being promoted should be
                         * in the lineage.
                         */
                        ASSERT(pg_cpu_is_bootstrapped(cpu));
                        continue;
                }

                /*
                 * cmt_pgs is ordered top-down, so the parent is expected
                 * to sit immediately before PG.
                 */
                ASSERT(idx > 0);
                ASSERT(GROUP_ACCESS(&pgd->cmt_pgs, idx - 1) == parent);

                /*
                 * Have the child and the parent swap places in the CPU's
                 * lineage
                 */
                group_remove_at(&pgd->cmt_pgs, idx);
                group_remove_at(&pgd->cmt_pgs, idx - 1);
                err = group_add_at(&pgd->cmt_pgs, parent, idx);
                ASSERT(err == 0);
                err = group_add_at(&pgd->cmt_pgs, pg, idx - 1);
                ASSERT(err == 0);

                /*
                 * Ensure cmt_lineage references CPU's leaf PG.
                 * Since cmt_pgs is top-down ordered, the bottom is the last
                 * element.
                 */
                if ((sz = GROUP_SIZE(&pgd->cmt_pgs)) > 0)
                        pgd->cmt_lineage = GROUP_ACCESS(&pgd->cmt_pgs, sz - 1);
        }

        /*
         * Update the parent references for PG and its parent.
         * This is done last because the loop above relies on the old
         * parent references.
         */
        pg->cmt_parent = parent->cmt_parent;
        parent->cmt_parent = pg;

        start_cpus();
}

/*
 * CMT class callback for a new CPU entering the system
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * being initialized). The argument "pgdata" is a reference to the CPU's PG
 * data to be constructed.
 *
 * While this routine runs, cp->cpu_pg (which the dispatcher uses to access
 * the CPU's PG data) references a "bootstrap" structure. pg_cmt_cpu_init()
 * and the routines it calls must be careful to operate only on the "pgdata"
 * argument, and not cp->cpu_pg.
 *
 * The caller must hold cpu_lock. Nothing is done if CMT scheduling has been
 * disabled.
 */
static void
pg_cmt_cpu_init(cpu_t *cp, cpu_pg_t *pgdata)
{
        pg_cmt_t        *pg;
        group_t         *cmt_pgs;
        int             levels, level;
        pghw_type_t     hw;
        pg_t            *pg_cache = NULL;
        pg_cmt_t        *cpu_cmt_hier[PGHW_NUM_COMPONENTS];
        lgrp_handle_t   lgrp_handle;
        cmt_lgrp_t      *lgrp;
        cmt_lineage_validation_t        lineage_status;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(pg_cpu_is_bootstrapped(cp));

        if (cmt_sched_disabled)
                return;

        /*
         * A new CPU is coming into the system.
         * Interrogate the platform to see if the CPU
         * has any performance or efficiency relevant
         * sharing relationships
         */
        cmt_pgs = &pgdata->cmt_pgs;
        pgdata->cmt_lineage = NULL;

        /*
         * cpu_cmt_hier collects the PGs that will form this CPU's load
         * balancing lineage. It is zeroed so that unused slots read as NULL.
         */
        bzero(cpu_cmt_hier, sizeof (cpu_cmt_hier));
        levels = 0;
        for (hw = PGHW_START; hw < PGHW_NUM_COMPONENTS; hw++) {

                pg_cmt_policy_t policy;

                /*
                 * We're only interested in the hw sharing relationships
                 * for which we know how to optimize.
                 */
                policy = pg_cmt_policy(hw);
                if (policy == CMT_NO_POLICY ||
                    pg_plat_hw_shared(cp, hw) == 0)
                        continue;

                /*
                 * We will still create the PGs for hardware sharing
                 * relationships that have been blacklisted, but won't
                 * implement CMT thread placement optimizations against them.
                 */
                if (cmt_hw_blacklisted[hw] == 1)
                        policy = CMT_NO_POLICY;

                /*
                 * Find (or create) the PG associated with
                 * the hw sharing relationship in which cp
                 * belongs.
                 *
                 * Determine if a suitable PG already
                 * exists, or if one needs to be created.
                 */
                pg = (pg_cmt_t *)pghw_place_cpu(cp, hw);
                if (pg == NULL) {
                        /*
                         * Create a new one.
                         * Initialize the common...
                         *
                         * NOTE(review): the result of pg_create() is used
                         * without a NULL check, while pg_cmt_alloc()
                         * allocates with KM_NOSLEEP; confirm that
                         * pg_create() cannot return NULL.
                         */
                        pg = (pg_cmt_t *)pg_create(pg_cmt_class_id);

                        /* ... physical ... */
                        pghw_init((pghw_t *)pg, cp, hw);

                        /*
                         * ... and CMT specific portions of the
                         * structure.
                         */
                        pg->cmt_policy = policy;

                        /* CMT event callbacks */
                        cmt_callback_init((pg_t *)pg);

                        bitset_init(&pg->cmt_cpus_actv_set);
                        group_create(&pg->cmt_cpus_actv);
                } else {
                        ASSERT(IS_CMT_PG(pg));
                }

                /* The PG's membership is about to change */
                ((pghw_t *)pg)->pghw_generation++;

                /* Add the CPU to the PG */
                pg_cpu_add((pg_t *)pg, cp, pgdata);

                /*
                 * Ensure capacity of the active CPU group/bitset
                 */
                group_expand(&pg->cmt_cpus_actv,
                    GROUP_SIZE(&((pg_t *)pg)->pg_cpus));

                if (cp->cpu_seqid >=
                    bitset_capacity(&pg->cmt_cpus_actv_set)) {
                        bitset_resize(&pg->cmt_cpus_actv_set,
                            cp->cpu_seqid + 1);
                }

                /*
                 * Build a lineage of CMT PGs for load balancing / coalescence
                 */
                if (policy & (CMT_BALANCE | CMT_COALESCE)) {
                        cpu_cmt_hier[levels++] = pg;
                }

                /* Cache this for later */
                if (hw == PGHW_CACHE)
                        pg_cache = (pg_t *)pg;
        }

        group_expand(cmt_pgs, levels);

        /* The root cmt_lgrp_t is created on first use */
        if (cmt_root == NULL)
                cmt_root = pg_cmt_lgrp_create(lgrp_plat_root_hand());

        /*
         * Find the lgrp that encapsulates this CPU's CMT hierarchy
         */
        lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);
        if ((lgrp = pg_cmt_find_lgrp(lgrp_handle)) == NULL)
                lgrp = pg_cmt_lgrp_create(lgrp_handle);

        /*
         * Ascendingly sort the PGs in the lineage by number of CPUs
         */
        pg_cmt_hier_sort(cpu_cmt_hier, levels);

        /*
         * Examine the lineage and validate it.
         * This routine will also try to fix the lineage along with the
         * rest of the PG hierarchy should it detect an issue.
         *
         * If it returns anything other than VALID or REPAIRED, an
         * unrecoverable error has occurred, and we cannot proceed.
         */
        lineage_status = pg_cmt_lineage_validate(cpu_cmt_hier, &levels, pgdata);
        if ((lineage_status != CMT_LINEAGE_VALID) &&
            (lineage_status != CMT_LINEAGE_REPAIRED)) {
                /*
                 * In the case of an unrecoverable error where CMT scheduling
                 * has been disabled, assert that the under construction CPU's
                 * PG data has an empty CMT load balancing lineage.
                 */
                ASSERT((cmt_sched_disabled == 0) ||
                    (GROUP_SIZE(&(pgdata->cmt_pgs)) == 0));
                return;
        }

        /*
         * For existing PGs in the lineage, verify that the parent is
         * correct, as the generation in the lineage may have changed
         * as a result of the sorting. Start the traversal at the top
         * of the lineage, moving down.
         *
         * For the top-most PG, cpu_cmt_hier[level + 1] is the slot just
         * past the lineage and is expected to be NULL (the array was zeroed
         * above). NOTE(review): this presumes levels is always less than
         * PGHW_NUM_COMPONENTS and that pg_cmt_lineage_validate() leaves the
         * trailing slots NULL - confirm.
         */
        for (level = levels - 1; level >= 0; ) {
                int reorg;

                reorg = 0;
                pg = cpu_cmt_hier[level];

                /*
                 * Promote PGs at an incorrect generation into place.
                 */
                while (pg->cmt_parent &&
                    pg->cmt_parent != cpu_cmt_hier[level + 1]) {
                        cmt_hier_promote(pg, pgdata);
                        reorg++;
                }
                /*
                 * A promotion changes the hierarchy, so restart the
                 * verification from the top.
                 */
                if (reorg > 0)
                        level = levels - 1;
                else
                        level--;
        }

        /*
         * For each of the PGs in the CPU's lineage:
         *      - Add an entry in the CPU sorted CMT PG group
         *        which is used for top down CMT load balancing
         *      - Tie the PG into the CMT hierarchy by connecting
         *        it to its parent and siblings.
         */
        for (level = 0; level < levels; level++) {
                uint_t          children;
                int             err;

                /*
                 * cpu_cmt_hier is sorted ascending (leaf first), while
                 * cmt_pgs is kept top-down, hence the reversed index.
                 */
                pg = cpu_cmt_hier[level];
                err = group_add_at(cmt_pgs, pg, levels - level - 1);
                ASSERT(err == 0);

                /* The lineage starts at the CPU's leaf PG */
                if (level == 0)
                        pgdata->cmt_lineage = (pg_t *)pg;

                if (pg->cmt_siblings != NULL) {
                        /* Already initialized */
                        ASSERT(pg->cmt_parent == NULL ||
                            pg->cmt_parent == cpu_cmt_hier[level + 1]);
                        ASSERT(pg->cmt_siblings == &lgrp->cl_pgs ||
                            ((pg->cmt_parent != NULL) &&
                            pg->cmt_siblings == pg->cmt_parent->cmt_children));
                        continue;
                }

                if ((level + 1) == levels) {
                        /*
                         * Top level PG: it has no parent, and its siblings
                         * are the lgroup's other top level PGs. The root
                         * lgroup's count covers every top level PG.
                         */
                        pg->cmt_parent = NULL;

                        pg->cmt_siblings = &lgrp->cl_pgs;
                        children = ++lgrp->cl_npgs;
                        if (cmt_root != lgrp)
                                cmt_root->cl_npgs++;
                } else {
                        pg->cmt_parent = cpu_cmt_hier[level + 1];

                        /*
                         * A good parent keeps track of their children.
                         * The parent's children group is also the PG's
                         * siblings.
                         */
                        if (pg->cmt_parent->cmt_children == NULL) {
                                pg->cmt_parent->cmt_children =
                                    kmem_zalloc(sizeof (group_t), KM_SLEEP);
                                group_create(pg->cmt_parent->cmt_children);
                        }
                        pg->cmt_siblings = pg->cmt_parent->cmt_children;
                        children = ++pg->cmt_parent->cmt_nchildren;
                }

                /* Make room in the sibling groups for the counts above */
                group_expand(pg->cmt_siblings, children);
                group_expand(&cmt_root->cl_pgs, cmt_root->cl_npgs);
        }

        /*
         * Cache the chip and core IDs in the cpu_t->cpu_physid structure
         * for fast lookups later.
         */
        if (cp->cpu_physid) {
                cp->cpu_physid->cpu_chipid =
                    pg_plat_hw_instance_id(cp, PGHW_CHIP);
                cp->cpu_physid->cpu_coreid = pg_plat_get_core_id(cp);

                /*
                 * If this cpu has a PG representing shared cache, then set
                 * cpu_cacheid to that PG's logical id
                 */
                if (pg_cache)
                        cp->cpu_physid->cpu_cacheid = pg_cache->pg_id;
        }

        /* CPU0 only initialization */
        if (is_cpu0) {
                is_cpu0 = 0;
                cpu0_lgrp = lgrp;
        }

}

/*
 * Class callback when a CPU is leaving the system (deletion)
 *
 * "pgdata" is a reference to the CPU's PG data to be deconstructed.
 *
 * Across this function's invocation, cp->cpu_pg (which the dispatcher uses
 * to access the CPU's PG data) references a "bootstrap" structure.
 * pg_cmt_cpu_fini() and the routines it calls must be careful to operate only
 * on the "pgdata" argument, and not cp->cpu_pg.
 *
 * Nothing is done if CMT scheduling has been disabled.
 */
static void
pg_cmt_cpu_fini(cpu_t *cp, cpu_pg_t *pgdata)
{
        group_iter_t    i;
        pg_cmt_t        *pg;
        group_t         *pgs, *cmt_pgs;
        lgrp_handle_t   lgrp_handle;
        cmt_lgrp_t      *lgrp;

        if (cmt_sched_disabled)
                return;

        ASSERT(pg_cpu_is_bootstrapped(cp));

        pgs = &pgdata->pgs;
        cmt_pgs = &pgdata->cmt_pgs;

        /*
         * Find the lgroup that encapsulates this CPU's CMT hierarchy
         */
        lgrp_handle = lgrp_plat_cpu_to_hand(cp->cpu_id);

        lgrp = pg_cmt_find_lgrp(lgrp_handle);
        if (ncpus == 1 && lgrp != cpu0_lgrp) {
                /*
                 * One might wonder how we could be deconfiguring the
                 * only CPU in the system.
                 *
                 * On Starcat systems when null_proc_lpa is detected,
                 * the boot CPU (which is already configured into a leaf
                 * lgroup), is moved into the root lgroup. This is done by
                 * deconfiguring it from both lgroups and processor
                 * groups, and then later reconfiguring it back in.  This
                 * call to pg_cmt_cpu_fini() is part of that deconfiguration.
                 *
                 * This special case is detected by noting that the platform
                 * has changed the CPU's lgrp affiliation (since it now
                 * belongs in the root). In this case, use the cmt_lgrp_t
                 * cached for the boot CPU, since this is what needs to be
                 * torn down.
                 */
                lgrp = cpu0_lgrp;
        }

        ASSERT(lgrp != NULL);

        /*
         * First, clean up anything load balancing specific for each of
         * the CPU's PGs that participated in CMT load balancing.
         * The walk starts at the CPU's leaf PG and follows the parent
         * links upward.
         */
        pg = (pg_cmt_t *)pgdata->cmt_lineage;
        while (pg != NULL) {

                /* The PG's membership is about to change */
                ((pghw_t *)pg)->pghw_generation++;

                /*
                 * Remove the PG from the CPU's load balancing lineage
                 */
                (void) group_remove(cmt_pgs, pg, GRP_RESIZE);

                /*
                 * If it's about to become empty, destroy its children
                 * group, and remove its reference from its siblings.
                 * This is done here (rather than below) to avoid removing
                 * our reference from a PG that we just eliminated.
                 *
                 * NOTE(review): the children group_t is allocated with
                 * kmem_zalloc() in pg_cmt_cpu_init(), but only
                 * group_destroy() is called on it here; confirm whether
                 * the group_t storage itself is released anywhere.
                 */
                if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 1) {
                        if (pg->cmt_children != NULL)
                                group_destroy(pg->cmt_children);
                        if (pg->cmt_siblings != NULL) {
                                if (pg->cmt_siblings == &lgrp->cl_pgs)
                                        lgrp->cl_npgs--;
                                else
                                        pg->cmt_parent->cmt_nchildren--;
                        }
                }
                pg = pg->cmt_parent;
        }
        ASSERT(GROUP_SIZE(cmt_pgs) == 0);

        /*
         * Now that the load balancing lineage updates have happened,
         * remove the CPU from all its PGs (destroying any that become
         * empty). PGs that do not belong to the CMT class are skipped.
         */
        group_iter_init(&i);
        while ((pg = group_iterate(pgs, &i)) != NULL) {
                if (IS_CMT_PG(pg) == 0)
                        continue;

                pg_cpu_delete((pg_t *)pg, cp, pgdata);
                /*
                 * Deleting the CPU from the PG changes the CPU's
                 * PG group over which we are actively iterating.
                 * Re-initialize the iteration.
                 */
                group_iter_init(&i);

                if (GROUP_SIZE(&((pg_t *)pg)->pg_cpus) == 0) {

                        /*
                         * The PG has become zero sized, so destroy it.
                         */
                        group_destroy(&pg->cmt_cpus_actv);
                        bitset_fini(&pg->cmt_cpus_actv_set);
                        pghw_fini((pghw_t *)pg);

                        pg_destroy((pg_t *)pg);
                }
        }
}

/*
 * Class callback when a CPU is entering a cpu partition.
 *
 * Grows the partition's CMT PG bitset, where needed, so that it can hold
 * the id of every CMT PG that "cp" belongs to. The caller must hold
 * cpu_lock. Nothing is done if CMT scheduling has been disabled.
 */
static void
pg_cmt_cpupart_in(cpu_t *cp, cpupart_t *pp)
{
        group_t         *cpu_pgs;
        group_iter_t    iter;
        pg_t            *cur;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled)
                return;

        cpu_pgs = &cp->cpu_pg->pgs;

        for (group_iter_init(&iter);
            (cur = group_iterate(cpu_pgs, &iter)) != NULL; ) {
                /* Only PGs managed by the CMT class are of interest */
                if (!IS_CMT_PG(cur))
                        continue;

                /* The bitset must be able to represent this PG's id */
                if (cur->pg_id >= bitset_capacity(&pp->cp_cmt_pgs))
                        bitset_resize(&pp->cp_cmt_pgs, cur->pg_id + 1);
        }
}

/*
 * Class callback when a CPU is actually moving partitions.
 *
 * For every CMT PG that "cp" belongs to, the PG is recorded in the new
 * partition's CMT PG bitset, and dropped from the old partition's bitset
 * once no other active CPU of that PG remains in the old partition.
 * The caller must hold cpu_lock. Nothing is done if CMT scheduling has
 * been disabled.
 */
static void
pg_cmt_cpupart_move(cpu_t *cp, cpupart_t *oldpp, cpupart_t *newpp)
{
        group_t         *cpu_pgs;
        group_iter_t    iter;
        pg_cpu_itr_t    member_iter;
        pg_t            *cur;
        cpu_t           *member;
        boolean_t       still_present;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled)
                return;

        cpu_pgs = &cp->cpu_pg->pgs;

        for (group_iter_init(&iter);
            (cur = group_iterate(cpu_pgs, &iter)) != NULL; ) {

                /* Only PGs managed by the CMT class are of interest */
                if (!IS_CMT_PG(cur))
                        continue;

                /*
                 * The PG is now represented in the new partition.
                 */
                bitset_add(&newpp->cp_cmt_pgs, cur->pg_id);

                /*
                 * Look for some other active CPU of this PG that is
                 * still in the old partition; the search stops at the
                 * first one found.
                 */
                still_present = B_FALSE;
                PG_CPU_ITR_INIT(cur, member_iter);
                while (!still_present &&
                    (member = pg_cpu_next(&member_iter)) != NULL) {
                        if (member != cp && CPU_ACTIVE(member) &&
                            member->cpu_part->cp_id == oldpp->cp_id)
                                still_present = B_TRUE;
                }

                /*
                 * The last of the PG's CPUs has left, so remove the PG
                 * from the partition's bitset.
                 *
                 * NOTE(review): the removal goes through cp->cpu_part
                 * rather than oldpp. That is the old partition only if
                 * cp->cpu_part has not yet been switched when this
                 * callback runs - confirm against the caller.
                 */
                if (!still_present)
                        bitset_del(&cp->cpu_part->cp_cmt_pgs, cur->pg_id);
        }
}

/*
 * Class callback invoked when a CPU becomes active (online).
 *
 * This is called in a context where CPUs are paused, and with cpu_lock
 * held. For every CMT PG to which "cp" belongs, the CPU is added to the
 * PG's active CPU group and bitset, and the PG is marked present in the
 * CPU's partition. A PG gaining its first active CPU is additionally
 * registered with its siblings as a balancing candidate.
 */
static void
pg_cmt_cpu_active(cpu_t *cp)
{
        group_iter_t    walk;
        group_t         *cpu_pgs;
        pg_cmt_t        *cmt;
        int             rv;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled)
                return;

        cpu_pgs = &cp->cpu_pg->pgs;

        for (group_iter_init(&walk);
            (cmt = group_iterate(cpu_pgs, &walk)) != NULL; ) {

                if (IS_CMT_PG(cmt) == 0)
                        continue;

                /*
                 * The topology is changing, so advance the generation.
                 */
                ((pghw_t *)cmt)->pghw_generation++;

                /*
                 * GRP_NORESIZE: the group is expected to already have
                 * room, which the ASSERT verifies on DEBUG kernels.
                 */
                rv = group_add(&cmt->cmt_cpus_actv, cp, GRP_NORESIZE);
                ASSERT(rv == 0);

                /*
                 * First active CPU in a PG whose policy involves load
                 * balancing or coalescence: make the PG visible to its
                 * siblings as a candidate.
                 */
                if (GROUP_SIZE(&cmt->cmt_cpus_actv) == 1 &&
                    (cmt->cmt_policy & (CMT_BALANCE | CMT_COALESCE)) != 0) {
                        rv = group_add(cmt->cmt_siblings, cmt, GRP_NORESIZE);
                        ASSERT(rv == 0);

                        /*
                         * A top level PG whose sibling set is not already
                         * the root lgroup's set is also added there, so
                         * it takes part in root-level balancing.
                         */
                        if (cmt->cmt_parent == NULL &&
                            cmt->cmt_siblings != &cmt_root->cl_pgs) {
                                rv = group_add(&cmt_root->cl_pgs, cmt,
                                    GRP_NORESIZE);
                                ASSERT(rv == 0);
                        }
                }

                /*
                 * Record the CPU in the PG's active CPU bitset, and the
                 * PG as active in the CPU's partition.
                 */
                bitset_add(&cmt->cmt_cpus_actv_set, cp->cpu_seqid);
                bitset_add(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)cmt)->pg_id);
        }
}

/*
 * Class callback invoked when a CPU goes inactive (offline).
 *
 * This is called in a context where CPUs are paused, and with cpu_lock
 * held. For every CMT PG to which "cp" belongs, the CPU is removed from
 * the PG's active CPU group and bitset. A PG left with no active CPUs is
 * withdrawn as a balancing candidate, and the PG is cleared from the
 * partition's bitset when no other active CPU of the PG shares cp's
 * partition.
 */
static void
pg_cmt_cpu_inactive(cpu_t *cp)
{
        group_iter_t    pg_walk;
        pg_cpu_itr_t    cpu_walk;
        group_t         *cpu_pgs;
        pg_cmt_t        *cmt;
        cpu_t           *peer;
        boolean_t       part_in_use;
        int             rv;

        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled)
                return;

        cpu_pgs = &cp->cpu_pg->pgs;

        for (group_iter_init(&pg_walk);
            (cmt = group_iterate(cpu_pgs, &pg_walk)) != NULL; ) {

                if (IS_CMT_PG(cmt) == 0)
                        continue;

                /*
                 * The topology is changing, so advance the generation.
                 */
                ((pghw_t *)cmt)->pghw_generation++;

                /*
                 * Drop the CPU from the PG's active CPU group and from
                 * the matching bitset.
                 */
                rv = group_remove(&cmt->cmt_cpus_actv, cp, GRP_NORESIZE);
                ASSERT(rv == 0);
                bitset_del(&cmt->cmt_cpus_actv_set, cp->cpu_seqid);

                /*
                 * With no active CPUs left, a PG over which load was
                 * balanced (or coalesced) stops being a candidate.
                 */
                if (GROUP_SIZE(&cmt->cmt_cpus_actv) == 0 &&
                    (cmt->cmt_policy & (CMT_BALANCE | CMT_COALESCE)) != 0) {
                        rv = group_remove(cmt->cmt_siblings, cmt,
                            GRP_NORESIZE);
                        ASSERT(rv == 0);

                        if (cmt->cmt_parent == NULL &&
                            cmt->cmt_siblings != &cmt_root->cl_pgs) {
                                rv = group_remove(&cmt_root->cl_pgs, cmt,
                                    GRP_NORESIZE);
                                ASSERT(rv == 0);
                        }
                }

                /*
                 * Sanity: there can never be more active CPUs than
                 * CPUs in the PG.
                 */
                ASSERT(GROUP_SIZE(&cmt->cmt_cpus_actv) <=
                    GROUP_SIZE(&((pg_t *)cmt)->pg_cpus));

                /*
                 * Determine whether any other active CPU of this PG is
                 * in the same partition as cp; if none is, the PG is no
                 * longer represented in that partition.
                 */
                part_in_use = B_FALSE;
                PG_CPU_ITR_INIT(cmt, cpu_walk);
                while ((peer = pg_cpu_next(&cpu_walk)) != NULL) {
                        if (peer != cp && CPU_ACTIVE(peer) &&
                            peer->cpu_part->cp_id == cp->cpu_part->cp_id) {
                                part_in_use = B_TRUE;
                                break;
                        }
                }
                if (part_in_use)
                        continue;

                bitset_del(&cp->cpu_part->cp_cmt_pgs, ((pg_t *)cmt)->pg_id);
        }
}

/*
 * Return 1 if the CPU belongs in the given PG, and 0 otherwise.
 *
 * Membership is decided by asking the platform whether "cp" shares the
 * PG's hardware (pghw_hw) with the CPU stored at index 0 of the PG's
 * CPU group. That entry is asserted to be non-NULL.
 */
static int
pg_cmt_cpu_belongs(pg_t *pg, cpu_t *cp)
{
        cpu_t   *member = GROUP_ACCESS(&pg->pg_cpus, 0);

        ASSERT(member != NULL);

        return (pg_plat_cpus_share(cp, member, ((pghw_t *)pg)->pghw_hw) ?
            1 : 0);
}

/*
 * Sort the CPU's CMT hierarchy in place, where "size" is the number of
 * levels (entries in "hier").
 *
 * The array is first ordered by ascending CPU count using a shell sort.
 * Runs of entries with equal CPU counts are then ordered with an
 * insertion sort driven by pg_cmt_hier_rank().
 */
static void
pg_cmt_hier_sort(pg_cmt_t **hier, int size)
{
        pg_t            **pgv = (pg_t **)hier;
        pg_t            *held;
        int             gap, pos, slot;
        int             lo, hi, ncpus;

        /*
         * Pass 1: shell sort by number of CPUs. The gap starts at half
         * the array and shrinks by 5/11 each round, with a gap of 2
         * forced down to 1 so that the final round is a plain insertion
         * sort.
         */
        for (gap = size / 2; gap > 0;
            gap = (gap == 2) ? 1 : (gap * 5) / 11) {
                for (pos = gap; pos < size; pos++) {
                        held = pgv[pos];
                        for (slot = pos; slot >= gap &&
                            PG_NUM_CPUS(pgv[slot - gap]) > PG_NUM_CPUS(held);
                            slot -= gap) {
                                pgv[slot] = pgv[slot - gap];
                        }
                        pgv[slot] = held;
                }
        }

        /*
         * Pass 2: break ties. Starting from each index "lo", find the
         * extent [lo, hi) of entries having the same CPU count as
         * pgv[lo], then insertion sort that extent: an entry is moved
         * ahead of each predecessor that pg_cmt_hier_rank() selects
         * over it.
         *
         * NOTE(review): "lo" advances one slot at a time rather than
         * jumping to "hi", so the tail of each run is revisited.
         */
        for (lo = 0; lo < size; lo++) {
                ncpus = PG_NUM_CPUS(pgv[lo]);
                hi = lo;
                while (hi < size && ncpus == PG_NUM_CPUS(pgv[hi]))
                        hi++;

                for (pos = lo + 1; pos < hi; pos++) {
                        held = pgv[pos];
                        for (slot = pos - 1; slot >= lo &&
                            pg_cmt_hier_rank(hier[slot],
                            (pg_cmt_t *)held) == hier[slot]; slot--) {
                                pgv[slot + 1] = pgv[slot];
                        }
                        pgv[slot + 1] = held;
                }
        }
}

/*
 * Look up the cmt_lgrp_t for an lgroup handle.
 *
 * Walks the cmt_lgrps list and returns the first entry whose cl_hand
 * matches "hand", or NULL if there is none. The caller must hold
 * cpu_lock.
 */
static cmt_lgrp_t *
pg_cmt_find_lgrp(lgrp_handle_t hand)
{
        cmt_lgrp_t      *cur;

        ASSERT(MUTEX_HELD(&cpu_lock));

        for (cur = cmt_lgrps; cur != NULL; cur = cur->cl_next) {
                if (cur->cl_hand == hand)
                        return (cur);
        }

        return (NULL);
}

/*
 * Allocate a cmt_lgrp_t for the specified handle and return it.
 *
 * The new structure is zero-filled, linked at the head of the cmt_lgrps
 * list, and its PG group (cl_pgs) is created empty. The allocation uses
 * KM_SLEEP. The caller must hold cpu_lock.
 */
static cmt_lgrp_t *
pg_cmt_lgrp_create(lgrp_handle_t hand)
{
        cmt_lgrp_t      *fresh;

        ASSERT(MUTEX_HELD(&cpu_lock));

        fresh = kmem_zalloc(sizeof (*fresh), KM_SLEEP);
        fresh->cl_hand = hand;
        fresh->cl_npgs = 0;

        /*
         * Push onto the front of the list of known lgroups.
         */
        fresh->cl_next = cmt_lgrps;
        cmt_lgrps = fresh;

        group_create(&fresh->cl_pgs);

        return (fresh);
}

/*
 * Interfaces to enable and disable power aware dispatching.
 * The caller must be holding cpu_lock.
 *
 * Both return 0 on success and -1 on failure.
 *
 * cmt_pad_enable() fails if CMT scheduling is disabled, if no instances
 * of the given type of power domain can be found, or if that hardware
 * type has been blacklisted.
 */
int
cmt_pad_enable(pghw_type_t type)
{
        group_iter_t    walk;
        group_t         *domains;
        pg_cmt_t        *dom;
        pg_cmt_t        *up;

        ASSERT(PGHW_IS_PM_DOMAIN(type));
        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled == 1)
                return (-1);

        domains = pghw_set_lookup(type);
        if (domains == NULL || cmt_hw_blacklisted[type])
                return (-1);

        /*
         * Give each power domain its default dispatcher policy for
         * power/performance optimization.
         *
         * Setting the policy alone isn't sufficient when the domain is
         * an only child of another PG: the dispatcher walks the PG
         * hierarchy top down, so the policy of the PG higher up would
         * dominate. To let the domain's policy take effect, it is
         * promoted above its parent for as long as the parent has a
         * different policy and exactly the same number of CPUs.
         */
        for (group_iter_init(&walk);
            (dom = group_iterate(domains, &walk)) != NULL; ) {

                dom->cmt_policy = pg_cmt_policy(((pghw_t *)dom)->pghw_hw);

                for (;;) {
                        up = dom->cmt_parent;
                        if (up == NULL ||
                            up->cmt_policy == dom->cmt_policy ||
                            PG_NUM_CPUS((pg_t *)dom) !=
                            PG_NUM_CPUS((pg_t *)up))
                                break;

                        cmt_hier_promote(dom, NULL);
                }
        }

        return (0);
}

/*
 * Disable power aware dispatching for the given type of power domain.
 * The caller must be holding cpu_lock.
 *
 * Returns 0 on success, or -1 if CMT scheduling is disabled or no
 * instances of the given type of power domain can be found.
 */
int
cmt_pad_disable(pghw_type_t type)
{
        group_iter_t    walk;
        group_t         *domains;
        group_t         *kids;
        pg_cmt_t        *dom;
        pg_cmt_t        *only;

        ASSERT(PGHW_IS_PM_DOMAIN(type));
        ASSERT(MUTEX_HELD(&cpu_lock));

        if (cmt_sched_disabled == 1)
                return (-1);

        domains = pghw_set_lookup(type);
        if (domains == NULL)
                return (-1);

        /*
         * Revert each power domain to the default dispatcher policy for
         * performance optimization (load balancing).
         */
        for (group_iter_init(&walk);
            (dom = group_iterate(domains, &walk)) != NULL; ) {

                /*
                 * When the domain has exactly one child, and that child
                 * implements a policy other than load balancing, the
                 * child is promoted above the domain so that the child's
                 * policy dominates.
                 */
                kids = dom->cmt_children;
                if (kids != NULL && GROUP_SIZE(kids) == 1) {
                        only = GROUP_ACCESS(kids, 0);
                        if ((only->cmt_policy & CMT_BALANCE) == 0)
                                cmt_hier_promote(only, NULL);
                }

                dom->cmt_policy = CMT_BALANCE;
        }

        return (0);
}

/*
 * Thread switch event handler.
 *
 * Atomically increments the PG's cmt_utilization count when the CPU is
 * switching away from its idle thread, and decrements it when the CPU
 * is switching to its idle thread. Any other switch leaves the count
 * untouched.
 */
/* ARGSUSED */
static void
cmt_ev_thread_swtch(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
        pg_cmt_t        *cmt = (pg_cmt_t *)pg;

        if (old == cp->cpu_idle_thread) {
                atomic_inc_32(&cmt->cmt_utilization);
                return;
        }

        if (new == cp->cpu_idle_thread)
                atomic_dec_32(&cmt->cmt_utilization);
}

/*
 * Evaluates to non-zero when thread "t" is in the TS_RUN state, its
 * dispatch queue is associated with a CPU, and that CPU's sequence id
 * is in the active CPU bitset of "pg" (a pg_cmt_t *).
 *
 * Note that "t" is evaluated more than once.
 */
#define THREAD_RUNNABLE_IN_PG(t, pg)                                    \
        ((t)->t_state == TS_RUN &&                                      \
            (t)->t_disp_queue->disp_cpu != NULL &&                      \
            bitset_in_set(&(pg)->cmt_cpus_actv_set,                     \
            (t)->t_disp_queue->disp_cpu->cpu_seqid))

/*
 * Thread switch event handler that also reports to the CPU power
 * manager.
 *
 * The PG's cmt_utilization count is maintained as the CPU leaves or
 * enters its idle thread. When the count rises to 1, the domain found
 * in the PG's pghw_handle is reported busy; when it falls to 0, the
 * domain is reported idle, unless the outgoing thread is still runnable
 * on a CPU within the PG.
 */
static void
cmt_ev_thread_swtch_pwr(pg_t *pg, cpu_t *cp, hrtime_t now, kthread_t *old,
    kthread_t *new)
{
        pg_cmt_t        *cmt = (pg_cmt_t *)pg;
        cpupm_domain_t  *dom;
        uint32_t        u;

        if (old == cp->cpu_idle_thread) {
                ASSERT(new != cp->cpu_idle_thread);

                u = atomic_inc_32_nv(&cmt->cmt_utilization);
                if (u != 1)
                        return;

                /*
                 * Tell the CPU power manager the domain is non-idle.
                 */
                dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
                cpupm_utilization_event(cp, now, dom,
                    CPUPM_DOM_BUSY_FROM_IDLE);
                return;
        }

        if (new != cp->cpu_idle_thread)
                return;

        ASSERT(old != cp->cpu_idle_thread);

        u = atomic_dec_32_nv(&cmt->cmt_utilization);
        if (u != 0)
                return;

        /*
         * The count has dropped to zero. Skip the notification if the
         * thread is simply migrating between CPUs in the domain.
         */
        if (THREAD_RUNNABLE_IN_PG(old, cmt))
                return;

        dom = (cpupm_domain_t *)cmt->cmt_pg.pghw_handle;
        cpupm_utilization_event(cp, now, dom, CPUPM_DOM_IDLE_FROM_BUSY);
}

/*
 * Thread remain event handler: reports CPUPM_DOM_REMAIN_BUSY, with a
 * timestamp of zero, to the CPU power manager for the domain found in
 * the PG's pghw_handle.
 */
/* ARGSUSED */
static void
cmt_ev_thread_remain_pwr(pg_t *pg, cpu_t *cp, kthread_t *t)
{
        cpupm_domain_t  *dom;

        dom = (cpupm_domain_t *)((pg_cmt_t *)pg)->cmt_pg.pghw_handle;

        cpupm_utilization_event(cp, (hrtime_t)0, dom, CPUPM_DOM_REMAIN_BUSY);
}

/*
 * Return the name of the CMT scheduling policy being implemented
 * across this PG.
 *
 * CMT_BALANCE takes precedence over CMT_COALESCE when both are set, and
 * " & Affinity" is reflected in the name whenever CMT_AFFINITY is set.
 * The returned string is a constant and must not be modified or freed.
 */
static char *
pg_cmt_policy_name(pg_t *pg)
{
        pg_cmt_policy_t policy = ((pg_cmt_t *)pg)->cmt_policy;
        int             affine = ((policy & CMT_AFFINITY) != 0);

        if (policy & CMT_BALANCE) {
                return (affine ? "Load Balancing & Affinity" :
                    "Load Balancing");
        }

        if (policy & CMT_COALESCE) {
                return (affine ? "Load Coalescence & Affinity" :
                    "Load Coalescence");
        }

        return (affine ? "Affinity" : "None");
}

/*
 * Prune PG, and all other instances of PG's hardware sharing relationship
 * from the CMT PG hierarchy.
 *
 * This routine operates on the CPU specific processor group data (for the CPUs
 * in the PG being pruned), and may be invoked from a context where one CPU's
 * PG data is under construction. In this case the argument "pgdata", if not
 * NULL, is a reference to the CPU's under-construction PG data.
 *
 * Arguments:
 *   pg_bad  - the offending PG; its hardware type selects the set of PGs
 *             that are pruned, and that type is blacklisted.
 *   lineage - array of PGs from which pg_bad is removed (if present).
 *   sz      - on entry, the number of entries in lineage; decremented by
 *             one if pg_bad was found and removed.
 *   pgdata  - PG data used in place of cpu->cpu_pg for any CPU for which
 *             pg_cpu_is_bootstrapped() returns true.
 *
 * The caller must hold cpu_lock. CPUs are paused while the hierarchy is
 * modified, and restarted before returning. This routine always returns 0.
 */
static int
pg_cmt_prune(pg_cmt_t *pg_bad, pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
        group_t         *hwset, *children;
        int             i, j, r, size = *sz;
        group_iter_t    hw_iter, child_iter;
        pg_cpu_itr_t    cpu_iter;
        pg_cmt_t        *pg, *child;
        cpu_t           *cpu;
        int             cap_needed;
        pghw_type_t     hw;

        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * Inform pghw layer that this PG is pruned.
         */
        pghw_cmt_fini((pghw_t *)pg_bad);

        hw = ((pghw_t *)pg_bad)->pghw_hw;

        /*
         * Log which CPUPM feature is lost when a power domain is pruned.
         */
        if (hw == PGHW_POW_ACTIVE) {
                cmn_err(CE_NOTE, "!Active CPUPM domain groups look suspect. "
                    "Event Based CPUPM Unavailable");
        } else if (hw == PGHW_POW_IDLE) {
                cmn_err(CE_NOTE, "!Idle CPUPM domain groups look suspect. "
                    "Dispatcher assisted CPUPM disabled.");
        }

        /*
         * Find and eliminate the PG from the lineage, shifting the
         * entries that follow it down by one.
         */
        for (i = 0; i < size; i++) {
                if (lineage[i] == pg_bad) {
                        for (j = i; j < size - 1; j++)
                                lineage[j] = lineage[j + 1];
                        *sz = size - 1;
                        break;
                }
        }

        /*
         * We'll prune all instances of the hardware sharing relationship
         * represented by pg. But before we do that (and pause CPUs) we need
         * to ensure the hierarchy's groups are properly sized.
         */
        hwset = pghw_set_lookup(hw);

        /*
         * Blacklist the hardware so future processor groups of this type won't
         * participate in CMT thread placement.
         *
         * XXX
         * For heterogeneous system configurations, this might be overkill.
         * We may only need to blacklist the illegal PGs, and other instances
         * of this hardware sharing relationship may be ok.
         */
        cmt_hw_blacklisted[hw] = 1;

        /*
         * For each of the PGs being pruned, ensure sufficient capacity in
         * the siblings set for the PG's children
         */
        group_iter_init(&hw_iter);
        while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {
                /*
                 * PG is being pruned, but if it is bringing up more than
                 * one child, ask for more capacity in the siblings group.
                 * The PG itself leaves the siblings group, hence the
                 * extra capacity needed is one less than its child count.
                 */
                cap_needed = 0;
                if (pg->cmt_children &&
                    GROUP_SIZE(pg->cmt_children) > 1) {
                        cap_needed = GROUP_SIZE(pg->cmt_children) - 1;

                        group_expand(pg->cmt_siblings,
                            GROUP_SIZE(pg->cmt_siblings) + cap_needed);

                        /*
                         * If this is a top level group, also ensure the
                         * capacity in the root lgrp level CMT grouping.
                         */
                        if (pg->cmt_parent == NULL &&
                            pg->cmt_siblings != &cmt_root->cl_pgs) {
                                group_expand(&cmt_root->cl_pgs,
                                    GROUP_SIZE(&cmt_root->cl_pgs) + cap_needed);
                                cmt_root->cl_npgs += cap_needed;
                        }
                }
        }

        /*
         * We're operating on the PG hierarchy. Pause CPUs to ensure
         * exclusivity with respect to the dispatcher.
         */
        pause_cpus(NULL, NULL);

        /*
         * Prune all PG instances of the hardware sharing relationship
         * represented by pg.
         */
        group_iter_init(&hw_iter);
        while ((pg = group_iterate(hwset, &hw_iter)) != NULL) {

                /*
                 * Remove PG from its group of siblings, if it's there.
                 * The return values are deliberately ignored, since the
                 * PG may not be a member.
                 */
                if (pg->cmt_siblings) {
                        (void) group_remove(pg->cmt_siblings, pg, GRP_NORESIZE);
                }
                if (pg->cmt_parent == NULL &&
                    pg->cmt_siblings != &cmt_root->cl_pgs) {
                        (void) group_remove(&cmt_root->cl_pgs, pg,
                            GRP_NORESIZE);
                }

                /*
                 * Indicate that no CMT policy will be implemented across
                 * this PG.
                 */
                pg->cmt_policy = CMT_NO_POLICY;

                /*
                 * Move PG's children from its children set to its parent's
                 * children set. Note that the parent's children set, and PG's
                 * siblings set are the same thing.
                 *
                 * Because we are iterating over the same group that we are
                 * operating on (removing the children), first add all of PG's
                 * children to the parent's children set, and once we are done
                 * iterating, empty PG's children set.
                 */
                if (pg->cmt_children != NULL) {
                        children = pg->cmt_children;

                        group_iter_init(&child_iter);
                        while ((child = group_iterate(children, &child_iter))
                            != NULL) {
                                if (pg->cmt_siblings != NULL) {
                                        /*
                                         * GRP_NORESIZE: the capacity was
                                         * expanded above, before the CPUs
                                         * were paused.
                                         */
                                        r = group_add(pg->cmt_siblings, child,
                                            GRP_NORESIZE);
                                        ASSERT(r == 0);

                                        if (pg->cmt_parent == NULL &&
                                            pg->cmt_siblings !=
                                            &cmt_root->cl_pgs) {
                                                r = group_add(&cmt_root->cl_pgs,
                                                    child, GRP_NORESIZE);
                                                ASSERT(r == 0);
                                        }
                                }
                        }
                        group_empty(pg->cmt_children);
                }

                /*
                 * Reset the callbacks to the defaults
                 */
                pg_callback_set_defaults((pg_t *)pg);

                /*
                 * Update all the CPU lineages in each of PG's CPUs
                 */
                PG_CPU_ITR_INIT(pg, cpu_iter);
                while ((cpu = pg_cpu_next(&cpu_iter)) != NULL) {
                        pg_cmt_t        *cpu_pg;
                        group_iter_t    liter;  /* Iterator for the lineage */
                        cpu_pg_t        *cpd;   /* CPU's PG data */

                        /*
                         * A CPU whose lineage is still under construction
                         * references the bootstrap CPU PG data structure,
                         * so use the caller supplied "pgdata" for it.
                         */
                        if (pg_cpu_is_bootstrapped(cpu))
                                cpd = pgdata;
                        else
                                cpd = cpu->cpu_pg;

                        /*
                         * Iterate over the CPU's PGs updating the children
                         * of the PG being pruned, since they have a new
                         * parent and siblings set.
                         */
                        group_iter_init(&liter);
                        while ((cpu_pg = group_iterate(&cpd->pgs,
                            &liter)) != NULL) {
                                if (cpu_pg->cmt_parent == pg) {
                                        cpu_pg->cmt_parent = pg->cmt_parent;
                                        cpu_pg->cmt_siblings = pg->cmt_siblings;
                                }
                        }

                        /*
                         * Update the CPU's lineages
                         *
                         * Remove the PG from the CPU's group used for CMT
                         * scheduling.
                         */
                        (void) group_remove(&cpd->cmt_pgs, pg, GRP_NORESIZE);
                }
        }
        /*
         * The hierarchy is consistent again; let the CPUs resume.
         */
        start_cpus();
        return (0);
}

/*
 * Disable CMT scheduling.
 *
 * With all CPUs paused, the CMT PG group of every CPU on cpu_list that
 * has PG data is emptied, and cmt_sched_disabled is set. A note is
 * logged once the CPUs have been restarted. The caller must hold
 * cpu_lock.
 */
static void
pg_cmt_disable(void)
{
        cpu_t           *cur;

        ASSERT(MUTEX_HELD(&cpu_lock));

        pause_cpus(NULL, NULL);

        /*
         * cpu_list is circular; stop once the walk is back at its head.
         */
        cur = cpu_list;
        do {
                if (cur->cpu_pg != NULL)
                        group_empty(&cur->cpu_pg->cmt_pgs);
                cur = cur->cpu_next;
        } while (cur != cpu_list);

        cmt_sched_disabled = 1;

        start_cpus();

        cmn_err(CE_NOTE, "!CMT thread placement optimizations unavailable");
}

/*
 * CMT lineage validation
 *
 * This routine is invoked by pg_cmt_cpu_init() to validate the integrity
 * of the PGs in a CPU's lineage. This is necessary because it's possible that
 * some groupings (power domain groupings in particular) may be defined by
 * sources that are buggy (e.g. BIOS bugs). In such cases, it may not be
 * possible to integrate those groupings into the CMT PG hierarchy, if doing
 * so would violate the subset invariant of the hierarchy, which says that
 * a PG must be subset of its parent (if it has one).
 *
 * pg_cmt_lineage_validate()'s purpose is to detect grouping definitions that
 * would result in a violation of this invariant. If a violation is found,
 * and the PG is of a grouping type whose definition is known to originate from
 * suspect sources (BIOS), then pg_cmt_prune() will be invoked to prune the
 * PG (and all other instances of the PG's sharing relationship type) from the
 * CMT hierarchy. Further, future instances of that sharing relationship type
 * won't be added. If the grouping definition doesn't originate from suspect
 * sources, then pg_cmt_disable() will be invoked to log an error, and disable
 * CMT scheduling altogether.
 *
 * This routine is invoked after the CPU has been added to the PGs in which
 * it belongs, but before those PGs have been added to (or had their place
 * adjusted in) the CMT PG hierarchy.
 *
 * The first argument is the CPUs PG lineage (essentially an array of PGs in
 * which the CPU belongs) that has already been sorted in ascending order
 * by CPU count. Some of the PGs in the CPUs lineage may already have other
 * CPUs in them, and have already been integrated into the CMT hierarchy.
 *
 * The addition of this new CPU to these pre-existing PGs means that those
 * PGs may need to be promoted up in the hierarchy to satisfy the subset
 * invariant. In addition to testing the subset invariant for the lineage,
 * this routine also verifies that the addition of the new CPU to the
 * existing PGs wouldn't cause the subset invariant to be violated in
 * the existing lineages.
 *
 * This routine will normally return one of the following:
 * CMT_LINEAGE_VALID - There were no problems detected with the lineage.
 * CMT_LINEAGE_REPAIRED - Problems were detected, but repaired via pruning.
 *
 * Otherwise, this routine will return a value indicating which error it
 * was unable to recover from (and set cmt_lineage_status along the way).
 *
 * This routine operates on the CPU specific processor group data (for the CPU
 * whose lineage is being validated), which is under-construction.
 * "pgdata" is a reference to the CPU's under-construction PG data.
 * This routine must be careful to operate only on "pgdata", and not cp->cpu_pg.
 */
static cmt_lineage_validation_t
pg_cmt_lineage_validate(pg_cmt_t **lineage, int *sz, cpu_pg_t *pgdata)
{
        int             level, k, nlevels;
        pg_cmt_t        *cur, *above, *victim, *anc;
        cpu_t           *cpu;
        pg_cpu_itr_t    itr;
        lgrp_handle_t   home, cpu_home;

        ASSERT(MUTEX_HELD(&cpu_lock));
        cur = NULL;
        above = NULL;

revalidate:
        /*
         * Every pass starts from scratch. The lineage size is re-read
         * through "sz" each time, since "sz" is handed to pg_cmt_prune()
         * before we come back here.
         */
        victim = NULL;
        home = LGRP_NULL_HANDLE;
        nlevels = *sz;

        for (level = 0; level < nlevels; level++) {
                /*
                 * "cur" is the PG under examination, "above" is the next
                 * PG in the lineage (NULL once we reach the last one).
                 */
                cur = lineage[level];
                above = (level < nlevels - 1) ? lineage[level + 1] : NULL;

                /*
                 * The caller is required to have sorted the lineage by
                 * CPU count (ascending). Everything below relies on it.
                 */
                ASSERT(above == NULL ||
                    (PG_NUM_CPUS((pg_t *)cur) <= PG_NUM_CPUS((pg_t *)above)));

                /*
                 * Promotion check.
                 *
                 * In the end, walking the cmt_parent links must visit the
                 * PGs in the same order as the sorted lineage array, and
                 * PG promotion is the tool used to get there. Before any
                 * promotion happens, confirm that it can be done without
                 * breaking the subset invariant.
                 *
                 * The topology is mid-change: an existing ancestor of this
                 * PG that is not part of this CPU's lineage does not
                 * contain the new CPU, so this PG should now be larger
                 * than that ancestor and will have to be promoted above
                 * it. Walk up the existing ancestors and verify that.
                 *
                 * This is our defense against a buggy BIOS handing us bad
                 * power domain CPU groupings.
                 */
                for (anc = cur->cmt_parent; anc != NULL;
                    anc = anc->cmt_parent) {
                        /*
                         * Look for this ancestor among the lineage entries.
                         */
                        for (k = 0; k < nlevels; k++) {
                                if (lineage[k] == anc)
                                        break;
                        }
                        if (k < nlevels) {
                                /*
                                 * Found it in the lineage; the
                                 * concentricity checks below cover the
                                 * rest, so stop climbing.
                                 */
                                break;
                        }

                        /*
                         * The ancestor is outside the lineage, so this PG
                         * must end up above it, which requires the
                         * ancestor to be strictly smaller. Equal size is
                         * also an error: that would mean the ancestor
                         * belongs in the lineage, and it doesn't.
                         */
                        if (PG_NUM_CPUS((pg_t *)cur) <=
                            PG_NUM_CPUS((pg_t *)anc)) {
                                cmt_lineage_status = CMT_LINEAGE_NON_PROMOTABLE;
                                goto handle_error;
                        }
                }

                /*
                 * Per-CPU consistency checks over every CPU in this PG.
                 */
                PG_CPU_ITR_INIT((pg_t *)cur, itr);
                for (cpu = pg_cpu_next(&itr); cpu != NULL;
                    cpu = pg_cpu_next(&itr)) {
                        /*
                         * Concentricity: each CPU of this PG must also be
                         * a member of the next (same size or larger) PG
                         * in the lineage.
                         */
                        if (above != NULL &&
                            pg_cpu_find((pg_t *)above, cpu) == B_FALSE) {
                                cmt_lineage_status = CMT_LINEAGE_NON_CONCENTRIC;
                                goto handle_error;
                        }

                        /*
                         * lgroup containment: the first CPU seen in this
                         * pass establishes the lgroup, and every CPU
                         * visited afterwards has to match it.
                         */
                        cpu_home = lgrp_plat_cpu_to_hand(cpu->cpu_id);
                        if (home == LGRP_NULL_HANDLE) {
                                home = cpu_home;
                        } else if (cpu_home != home) {
                                cmt_lineage_status = CMT_LINEAGE_PG_SPANS_LGRPS;
                                goto handle_error;
                        }
                }
        }

handle_error:
        /*
         * We get here both on a clean pass and on a validation failure;
         * cmt_lineage_status says which.
         *
         * Grouping information can come from buggy sources (incorrect ACPI
         * tables on x86 systems, for example). Where that is plausible we
         * attempt recovery by pruning the offending grouping out of the PG
         * hierarchy: that level is no longer optimized for, but the
         * remaining levels still are.
         */
        switch (cmt_lineage_status) {
        case CMT_LINEAGE_VALID:
        case CMT_LINEAGE_REPAIRED:
                /* No outstanding problem to handle. */
                break;
        case CMT_LINEAGE_PG_SPANS_LGRPS:
                /*
                 * The PG has CPUs in more than one lgroup.
                 *
                 * That is unsupported: the dispatcher may not do CMT
                 * thread placement across lgroups, because doing so would
                 * conflict with the policies implementing MPO thread
                 * affinity.
                 *
                 * A PG whose sharing relationship type is known to
                 * legitimately span lgroups is simply pruned from the CMT
                 * PG hierarchy, so that no CMT thread placement policy is
                 * implemented for it. A zero return from pg_cmt_prune()
                 * is treated as a successful repair.
                 *
                 * Anything else is handled by the case that follows.
                 */
                if (((pghw_t *)cur)->pghw_hw == PGHW_CHIP &&
                    pg_cmt_prune(cur, lineage, sz, pgdata) == 0) {
                        cmt_lineage_status = CMT_LINEAGE_REPAIRED;
                        goto revalidate;
                }
                /* FALLTHROUGH */
        case CMT_LINEAGE_NON_PROMOTABLE:
                /*
                 * The PG is already part of another CPU's lineage and
                 * cannot legally be promoted into place without breaking
                 * the invariants of the hierarchy. If its grouping comes
                 * from a suspect source, try pruning it and start over.
                 */
                if (PG_CMT_HW_SUSPECT(((pghw_t *)cur)->pghw_hw) &&
                    pg_cmt_prune(cur, lineage, sz, pgdata) == 0) {
                        cmt_lineage_status = CMT_LINEAGE_REPAIRED;
                        goto revalidate;
                }

                /*
                 * The bad level could not be pruned out; give up on CMT
                 * scheduling altogether.
                 */
                pg_cmt_disable();
                break;
        case CMT_LINEAGE_NON_CONCENTRIC:
                /*
                 * The lineage is not concentric: "cur" holds at least one
                 * CPU that "above" (same size or larger) does not.
                 *
                 * Decide which of the two to blame by looking at whether
                 * either grouping comes from a potentially buggy source:
                 *
                 *  - equal CPU counts: prune whichever one is untrusted,
                 *    considering "above" first;
                 *  - "cur" is smaller: prune "cur" if it is untrusted.
                 *
                 * This repeats (via revalidate) until the lineage is
                 * concentric, or until the only remaining option would be
                 * to prune a level derived from a source we believed to be
                 * reliable, in which case CMT scheduling is disabled.
                 */
                if (PG_NUM_CPUS((pg_t *)cur) == PG_NUM_CPUS((pg_t *)above)) {
                        if (PG_CMT_HW_SUSPECT(((pghw_t *)above)->pghw_hw)) {
                                victim = above;
                        } else if (PG_CMT_HW_SUSPECT(
                            ((pghw_t *)cur)->pghw_hw)) {
                                victim = cur;
                        }
                } else if ((PG_NUM_CPUS((pg_t *)cur) <
                    PG_NUM_CPUS((pg_t *)above)) &&
                    (PG_CMT_HW_SUSPECT(((pghw_t *)cur)->pghw_hw))) {
                        victim = cur;
                }

                if (victim != NULL &&
                    pg_cmt_prune(victim, lineage, sz, pgdata) == 0) {
                        cmt_lineage_status = CMT_LINEAGE_REPAIRED;
                        goto revalidate;
                }

                /*
                 * Either no prunable level could be identified, or the
                 * prune itself failed. Disable CMT scheduling altogether.
                 */
                pg_cmt_disable();
                break;
        default:
                /*
                 * A validation error we have no recovery strategy for.
                 * Record it as unrecoverable and disable CMT scheduling.
                 */
                cmt_lineage_status = CMT_LINEAGE_UNRECOVERABLE;
                pg_cmt_disable();
                break;
        }

        return (cmt_lineage_status);
}