/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2019 Joyent, Inc.
 */

/*
 * Basic NUMA support in terms of locality groups
 *
 * Solaris needs to know which CPUs, memory, etc. are near each other to
 * provide good performance on NUMA machines by optimizing for locality.
 * In order to do this, a new abstraction called a "locality group (lgroup)"
 * has been introduced to keep track of which CPU-like and memory-like hardware
 * resources are close to each other.  Currently, latency is the only measure
 * used to determine how to group hardware resources into lgroups, but this
 * does not limit the groupings to be based solely on latency.  Other factors
 * may be used to determine the groupings in the future.
 *
 * Lgroups are organized into a hierarchy or topology that represents the
 * latency topology of the machine.  There is always at least a root lgroup in
 * the system.  It represents all the hardware resources in the machine at a
 * latency big enough that any hardware resource can at least access any other
 * hardware resource within that latency.  A Uniform Memory Access (UMA)
 * machine is represented with one lgroup (the root).  In contrast, a NUMA
 * machine is represented at least by the root lgroup and some number of leaf
 * lgroups where the leaf lgroups contain the hardware resources within the
 * least latency of each other and the root lgroup still contains all the
 * resources in the machine.  Some number of intermediate lgroups may exist
 * which represent more levels of locality than just the local latency of the
 * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
 * (e.g. root and intermediate lgroups) contain the next nearest resources to
 * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
 * to the root lgroup shows the hardware resources from closest to farthest
 * from the leaf lgroup such that each successive ancestor lgroup contains
 * the next nearest resources at the next level of locality from the previous.
 *
 * The kernel uses the lgroup abstraction to know how to allocate resources
 * near a given process/thread.  At fork() and lwp/thread_create() time, a
 * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
 * with the lowest load average.  Binding to a processor or processor set will
 * change the home lgroup for a thread.  The scheduler has been modified to try
 * to dispatch a thread on a CPU in its home lgroup.  Physical memory
 * allocation is lgroup aware too, so memory will be allocated from the current
 * thread's home lgroup if possible.  If the desired resources are not
 * available, the kernel traverses the lgroup hierarchy going to the parent
 * lgroup to find resources at the next level of locality until it reaches the
 * root lgroup.
 */
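
/*
 * For illustration only: a hypothetical two-socket NUMA machine might be
 * represented by a root lgroup and two leaf lgroups, one per socket (the
 * actual number of lgroups and the shape of the hierarchy are platform
 * dependent):
 *
 *                  root lgroup (all CPUs and all memory)
 *                   /                              \
 *     leaf lgroup 1 (socket 0 CPUs      leaf lgroup 2 (socket 1 CPUs
 *     and its local memory)             and its local memory)
 *
 * A thread homed to leaf lgroup 1 is preferentially dispatched on socket 0
 * CPUs and has its memory allocated from socket 0 first, falling back toward
 * the root (any socket) when local resources are unavailable.
 */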

#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/param.h>
#include <sys/var.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/kmem.h>
#include <vm/seg.h>
#include <vm/seg_kmem.h>
#include <vm/seg_spt.h>
#include <vm/seg_vn.h>
#include <vm/as.h>
#include <sys/atomic.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/cmn_err.h>
#include <sys/kstat.h>
#include <sys/sysmacros.h>
#include <sys/pg.h>
#include <sys/promif.h>
#include <sys/sdt.h>
#include <sys/smt.h>

lgrp_gen_t      lgrp_gen = 0;           /* generation of lgroup hierarchy */
lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
                                /* indexed by lgrp_id */
int     nlgrps;                 /* number of lgroups in machine */
int     lgrp_alloc_hint = -1;   /* hint for where to try to allocate next */
int     lgrp_alloc_max = 0;     /* max lgroup ID allocated so far */

/*
 * Kstat data for lgroups.
 *
 * Actual kstat data is collected in lgrp_stats array.
 * The lgrp_kstat_data array of named kstats is used to extract data from
 * lgrp_stats and present it to the kstat framework. It is protected from
 * parallel modifications by lgrp_kstat_mutex. This may cause some contention
 * when several kstat commands run in parallel, but this is not the
 * performance-critical path.
 */
extern struct lgrp_stats lgrp_stats[];  /* table of per-lgrp stats */

/*
 * Declare kstat names statically for enums as defined in the header file.
 */
LGRP_KSTAT_NAMES;

static void     lgrp_kstat_init(void);
static int      lgrp_kstat_extract(kstat_t *, int);
static void     lgrp_kstat_reset(lgrp_id_t);

static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
static kmutex_t lgrp_kstat_mutex;


/*
 * max number of lgroups supported by the platform
 */
int     nlgrpsmax = 0;

/*
 * The root lgroup. Represents the set of resources at the system wide
 * level of locality.
 */
lgrp_t          *lgrp_root = NULL;

/*
 * During system bootstrap cp_default does not contain the list of lgrp load
 * averages (cp_lgrploads). The list is allocated after the first CPU is brought
 * on-line when cp_default is initialized by cpupart_initialize_default().
 * Configuring CPU0 may create a two-level topology with root and one leaf node
 * containing CPU0. This topology is initially constructed in a special
 * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
 * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
 * for all lpl operations until cp_default is fully constructed.
 *
 * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
 * consumer who needs default lpl should use lpl_bootstrap which is a pointer to
 * the first element of lpl_bootstrap_list.
 *
 * CPUs that are added to the system but have not yet been assigned to an
 * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
 * on some architectures (x86) it's possible for the slave CPU startup thread
 * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
 */
#define LPL_BOOTSTRAP_SIZE 2
static lpl_t    lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
lpl_t           *lpl_bootstrap;
static lpl_t    *lpl_bootstrap_rset[LPL_BOOTSTRAP_SIZE];
static int      lpl_bootstrap_id2rset[LPL_BOOTSTRAP_SIZE];
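
/*
 * A rough sketch of the bootstrap lpl layout set up by lgrp_root_init()
 * below (lpl_bootstrap points at element 0):
 *
 *   lpl_bootstrap_list[0]  - root lpl; lpl_rset and lpl_id2rset point at
 *                            the bootstrap rset below
 *   lpl_bootstrap_list[1]  - leaf lpl for CPU0 (lpl_lgrpid 1), sharing the
 *                            same rset
 *   lpl_bootstrap_rset[0]  - &lpl_bootstrap_list[1], the single leaf
 */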

/*
 * If cp still references the bootstrap lpl, it has not yet been added to
 * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
 * a thread is trying to allocate memory close to a CPU that has no lgrp.
 */
#define LGRP_CPU_HAS_NO_LGRP(cp)        ((cp)->cpu_lpl == lpl_bootstrap)

static lgrp_t   lroot;

/*
 * Size, in bytes, beyond which random memory allocation policy is applied
 * to non-shared memory.  Default is the maximum size, so random memory
 * allocation won't be used for non-shared memory by default.
 */
size_t  lgrp_privm_random_thresh = (size_t)(-1);

/* the maximum effect that a single thread can have on its lgroup's load */
#define LGRP_LOADAVG_MAX_EFFECT(ncpu) \
        ((lgrp_loadavg_max_effect) / (ncpu))
uint32_t        lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
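
/*
 * For example, with lgrp_loadavg_max_effect left at its default of
 * LGRP_LOADAVG_THREAD_MAX and an lgroup containing 4 CPUs, a single thread
 * contributes at most LGRP_LOADAVG_THREAD_MAX / 4 to that lgroup's load.
 */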


/*
 * Size, in bytes, beyond which random memory allocation policy is applied to
 * shared memory.  Default is 8MB (2 ISM pages).
 */
size_t  lgrp_shm_random_thresh = 8*1024*1024;

/*
 * Whether to do processor set aware memory allocation by default
 */
int     lgrp_mem_pset_aware = 0;

/*
 * Set the default memory allocation policy for root lgroup
 */
lgrp_mem_policy_t       lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;

/*
 * Set the default memory allocation policy.  For most platforms,
 * next touch is sufficient, but some platforms may wish to override
 * this.
 */
lgrp_mem_policy_t       lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;


/*
 * lgroup CPU event handlers
 */
static void     lgrp_cpu_init(struct cpu *);
static void     lgrp_cpu_fini(struct cpu *, lgrp_id_t);
static lgrp_t   *lgrp_cpu_to_lgrp(struct cpu *);

/*
 * lgroup memory event handlers
 */
static void     lgrp_mem_init(int, lgrp_handle_t, boolean_t);
static void     lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
static void     lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);

/*
 * lgroup CPU partition event handlers
 */
static void     lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
static void     lgrp_part_del_cpu(struct cpu *);

/*
 * lgroup framework initialization
 */
static void     lgrp_main_init(void);
static void     lgrp_main_mp_init(void);
static void     lgrp_root_init(void);
static void     lgrp_setup(void);

/*
 * lpl topology
 */
static void     lpl_init(lpl_t *, lpl_t *, lgrp_t *);
static void     lpl_clear(lpl_t *);
static void     lpl_leaf_insert(lpl_t *, struct cpupart *);
static void     lpl_leaf_remove(lpl_t *, struct cpupart *);
static void     lpl_rset_add(lpl_t *, lpl_t *);
static void     lpl_rset_del(lpl_t *, lpl_t *);
static int      lpl_rset_contains(lpl_t *, lpl_t *);
static void     lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
static void     lpl_child_update(lpl_t *, struct cpupart *);
static int      lpl_pick(lpl_t *, lpl_t *);
static void     lpl_verify_wrapper(struct cpupart *);

/*
 * defines for lpl topology verifier return codes
 */

#define LPL_TOPO_CORRECT                        0
#define LPL_TOPO_PART_HAS_NO_LPL                -1
#define LPL_TOPO_CPUS_NOT_EMPTY                 -2
#define LPL_TOPO_LGRP_MISMATCH                  -3
#define LPL_TOPO_MISSING_PARENT                 -4
#define LPL_TOPO_PARENT_MISMATCH                -5
#define LPL_TOPO_BAD_CPUCNT                     -6
#define LPL_TOPO_RSET_MISMATCH                  -7
#define LPL_TOPO_LPL_ORPHANED                   -8
#define LPL_TOPO_LPL_BAD_NCPU                   -9
#define LPL_TOPO_RSET_MSSNG_LF                  -10
#define LPL_TOPO_CPU_HAS_BAD_LPL                -11
#define LPL_TOPO_NONLEAF_HAS_CPUS               -12
#define LPL_TOPO_LGRP_NOT_LEAF                  -13
#define LPL_TOPO_BAD_RSETCNT                    -14

/*
 * Return whether lgroup optimizations should be enabled on this system
 */
int
lgrp_optimizations(void)
{
        /*
         * System must have more than 2 lgroups to enable lgroup optimizations
         *
         * XXX This assumes that a 2 lgroup system has an empty root lgroup
         * with one child lgroup containing all the resources. A 2 lgroup
         * system with a root lgroup directly containing CPUs or memory might
         * need lgroup optimizations with its child lgroup, but there
         * isn't such a machine for now....
         */
        if (nlgrps > 2)
                return (1);

        return (0);
}

/*
 * Setup root lgroup
 */
static void
lgrp_root_init(void)
{
        lgrp_handle_t   hand;
        int             i;
        lgrp_id_t       id;

        /*
         * Create the "root" lgroup
         */
        ASSERT(nlgrps == 0);
        id = nlgrps++;

        lgrp_root = &lroot;

        lgrp_root->lgrp_cpu = NULL;
        lgrp_root->lgrp_mnodes = 0;
        lgrp_root->lgrp_nmnodes = 0;
        hand = lgrp_plat_root_hand();
        lgrp_root->lgrp_plathand = hand;

        lgrp_root->lgrp_id = id;
        lgrp_root->lgrp_cpucnt = 0;
        lgrp_root->lgrp_childcnt = 0;
        klgrpset_clear(lgrp_root->lgrp_children);
        klgrpset_clear(lgrp_root->lgrp_leaves);
        lgrp_root->lgrp_parent = NULL;
        lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);

        for (i = 0; i < LGRP_RSRC_COUNT; i++)
                klgrpset_clear(lgrp_root->lgrp_set[i]);

        lgrp_root->lgrp_kstat = NULL;

        lgrp_table[id] = lgrp_root;

        /*
         * Setup initial lpl list for CPU0 and initial t0 home.
         * The only lpl space we have so far is lpl_bootstrap. It is used for
         * all topology operations until cp_default is initialized at which
         * point t0.t_lpl will be updated.
         */
        lpl_bootstrap = lpl_bootstrap_list;
        t0.t_lpl = lpl_bootstrap;
        cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
        lpl_bootstrap_list[1].lpl_lgrpid = 1;

        /*
         * Set up the bootstrap rset.
         * Since the bootstrap topology has just the root and a leaf, the rset
         * contains just the leaf, and both lpls can use the same rset.
         */
        lpl_bootstrap_rset[0] = &lpl_bootstrap_list[1];
        lpl_bootstrap_list[0].lpl_rset_sz = 1;
        lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
        lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;

        lpl_bootstrap_list[1].lpl_rset_sz = 1;
        lpl_bootstrap_list[1].lpl_rset = lpl_bootstrap_rset;
        lpl_bootstrap_list[1].lpl_id2rset = lpl_bootstrap_id2rset;

        cp_default.cp_lgrploads = lpl_bootstrap;
}

/*
 * Initialize the lgroup framework and allow the platform to do the same
 *
 * This happens in stages during boot and is all funnelled through this routine
 * (see definition of lgrp_init_stages_t to see what happens at each stage and
 * when)
 */
void
lgrp_init(lgrp_init_stages_t stage)
{
        /*
         * Initialize the platform
         */
        lgrp_plat_init(stage);

        switch (stage) {
        case LGRP_INIT_STAGE1:
                /*
                 * Set max number of lgroups supported on this platform which
                 * must be less than the max number of lgroups supported by the
                 * common lgroup framework (e.g. NLGRPS_MAX is max elements in
                 * lgrp_table[], etc.)
                 */
                nlgrpsmax = lgrp_plat_max_lgrps();
                ASSERT(nlgrpsmax <= NLGRPS_MAX);
                break;

        case LGRP_INIT_STAGE2:
                lgrp_setup();
                break;

        case LGRP_INIT_STAGE4:
                lgrp_main_init();
                break;

        case LGRP_INIT_STAGE5:
                lgrp_main_mp_init();
                break;

        default:
                break;
        }
}

/*
 * Create the root and cpu0's lgroup, and set t0's home.
 */
static void
lgrp_setup(void)
{
        /*
         * Setup the root lgroup
         */
        lgrp_root_init();

        /*
         * Add cpu0 to an lgroup
         */
        lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
        lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
}

/*
 * True when lgrp initialization has been completed.
 */
int     lgrp_initialized = 0;

/*
 * True when lgrp topology is constructed.
 */
int     lgrp_topo_initialized = 0;

/*
 * Init routine called after startup(), /etc/system has been processed,
 * and cpu0 has been added to an lgroup.
 */
static void
lgrp_main_init(void)
{
        cpu_t           *cp = CPU;
        lgrp_id_t       lgrpid;
        int             i;
        extern void     pg_cpu0_reinit();

        /*
         * Enforce a valid lgrp_mem_default_policy
         */
        if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
            (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES) ||
            (lgrp_mem_default_policy == LGRP_MEM_POLICY_NEXT_SEG))
                lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;

        /*
         * See if mpo should be disabled.
         * This may happen in the case of null proc LPA on Starcat.
         * The platform won't be able to detect null proc LPA until after
         * cpu0 and memory have already been added to lgroups.
         * When and if it is detected, the Starcat platform will return
         * a different platform handle for cpu0 which is what we check for
         * here. If mpo should be disabled, move cpu0 to its rightful place
         * (the root), and destroy the remaining lgroups. This effectively
         * provides a UMA lgroup topology.
         */
        lgrpid = cp->cpu_lpl->lpl_lgrpid;
        if (lgrp_table[lgrpid]->lgrp_plathand !=
            lgrp_plat_cpu_to_hand(cp->cpu_id)) {
                lgrp_part_del_cpu(cp);
                lgrp_cpu_fini(cp, lgrpid);

                lgrp_cpu_init(cp);
                lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);

                ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);

                /*
                 * Notify the PG subsystem that the CPU's lgrp
                 * association has changed
                 */
                pg_cpu0_reinit();

                /*
                 * Destroy all lgroups except for root
                 */
                for (i = 0; i <= lgrp_alloc_max; i++) {
                        if (LGRP_EXISTS(lgrp_table[i]) &&
                            lgrp_table[i] != lgrp_root)
                                lgrp_destroy(lgrp_table[i]);
                }

                /*
                 * Fix up root to point at itself for leaves and resources
                 * and not have any children
                 */
                lgrp_root->lgrp_childcnt = 0;
                klgrpset_clear(lgrp_root->lgrp_children);
                klgrpset_clear(lgrp_root->lgrp_leaves);
                klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
                klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
                klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
        }

        /*
         * Initialize kstats framework.
         */
        lgrp_kstat_init();
        /*
         * cpu0 is finally where it should be, so create its lgroup's kstats
         */
        mutex_enter(&cpu_lock);
        lgrp_kstat_create(cp);
        mutex_exit(&cpu_lock);

        lgrp_initialized = 1;
}

/*
 * Finish lgrp initialization after all CPUS are brought on-line.
 * This routine is called after start_other_cpus().
 */
static void
lgrp_main_mp_init(void)
{
        klgrpset_t changed;

        smt_init();

        /*
         * Update lgroup topology (if necessary)
         */
        klgrpset_clear(changed);
        (void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
        lgrp_topo_initialized = 1;
}

/*
 * Change latency of lgroup with specified lgroup platform handle (if one is
 * given) or change all lgroups with old latency to new latency
 */
void
lgrp_latency_change(lgrp_handle_t hand, u_longlong_t oldtime,
    u_longlong_t newtime)
{
        lgrp_t          *lgrp;
        int             i;

        for (i = 0; i <= lgrp_alloc_max; i++) {
                lgrp = lgrp_table[i];

                if (!LGRP_EXISTS(lgrp))
                        continue;

                if ((hand == LGRP_NULL_HANDLE &&
                    lgrp->lgrp_latency == oldtime) ||
                    (hand != LGRP_NULL_HANDLE && lgrp->lgrp_plathand == hand))
                        lgrp->lgrp_latency = (int)newtime;
        }
}

/*
 * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
 */
void
lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
{
        klgrpset_t      changed;
        cpu_t           *cp;
        lgrp_id_t       id;
        int             rc;

        switch (event) {
        /*
         * The following (re)configuration events are common code
         * initiated. lgrp_plat_config() is called here to inform the
         * platform of the reconfiguration event.
         */
        case LGRP_CONFIG_CPU_ADD:
                cp = (cpu_t *)resource;

                /*
                 * Initialize the new CPU's lgrp related next/prev
                 * links, and give it a bootstrap lpl so that it can
                 * survive should it need to enter the dispatcher.
                 */
                cp->cpu_next_lpl = cp;
                cp->cpu_prev_lpl = cp;
                cp->cpu_next_lgrp = cp;
                cp->cpu_prev_lgrp = cp;
                cp->cpu_lpl = lpl_bootstrap;

                lgrp_plat_config(event, resource);
                atomic_inc_32(&lgrp_gen);

                break;
        case LGRP_CONFIG_CPU_DEL:
                lgrp_plat_config(event, resource);
                atomic_inc_32(&lgrp_gen);

                break;
        case LGRP_CONFIG_CPU_ONLINE:
                cp = (cpu_t *)resource;
                lgrp_cpu_init(cp);
                lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
                rc = lpl_topo_verify(cp->cpu_part);
                if (rc != LPL_TOPO_CORRECT) {
                        panic("lpl_topo_verify failed: %d", rc);
                }
                lgrp_plat_config(event, resource);
                atomic_inc_32(&lgrp_gen);

                break;
        case LGRP_CONFIG_CPU_OFFLINE:
                cp = (cpu_t *)resource;
                id = cp->cpu_lpl->lpl_lgrpid;
                lgrp_part_del_cpu(cp);
                lgrp_cpu_fini(cp, id);
                rc = lpl_topo_verify(cp->cpu_part);
                if (rc != LPL_TOPO_CORRECT) {
                        panic("lpl_topo_verify failed: %d", rc);
                }
                lgrp_plat_config(event, resource);
                atomic_inc_32(&lgrp_gen);

                break;
        case LGRP_CONFIG_CPUPART_ADD:
                cp = (cpu_t *)resource;
                lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
                rc = lpl_topo_verify(cp->cpu_part);
                if (rc != LPL_TOPO_CORRECT) {
                        panic("lpl_topo_verify failed: %d", rc);
                }
                lgrp_plat_config(event, resource);

                break;
        case LGRP_CONFIG_CPUPART_DEL:
                cp = (cpu_t *)resource;
                lgrp_part_del_cpu((cpu_t *)resource);
                rc = lpl_topo_verify(cp->cpu_part);
                if (rc != LPL_TOPO_CORRECT) {
                        panic("lpl_topo_verify failed: %d", rc);
                }
                lgrp_plat_config(event, resource);

                break;
        /*
         * The following events are initiated by the memnode
         * subsystem.
         */
        case LGRP_CONFIG_MEM_ADD:
                lgrp_mem_init((int)resource, where, B_FALSE);
                atomic_inc_32(&lgrp_gen);

                break;
        case LGRP_CONFIG_MEM_DEL:
                lgrp_mem_fini((int)resource, where, B_FALSE);
                atomic_inc_32(&lgrp_gen);

                break;
        case LGRP_CONFIG_MEM_RENAME: {
                lgrp_config_mem_rename_t *ren_arg =
                    (lgrp_config_mem_rename_t *)where;

                lgrp_mem_rename((int)resource,
                    ren_arg->lmem_rename_from,
                    ren_arg->lmem_rename_to);
                atomic_inc_32(&lgrp_gen);

                break;
        }
        case LGRP_CONFIG_GEN_UPDATE:
                atomic_inc_32(&lgrp_gen);

                break;
        case LGRP_CONFIG_FLATTEN:
                if (where == 0)
                        lgrp_topo_levels = (int)resource;
                else
                        (void) lgrp_topo_flatten(resource,
                            lgrp_table, lgrp_alloc_max, &changed);

                break;
        /*
         * Update any lgroups with old latency to new latency
         */
        case LGRP_CONFIG_LAT_CHANGE_ALL:
                lgrp_latency_change(LGRP_NULL_HANDLE, (u_longlong_t)resource,
                    (u_longlong_t)where);

                break;
        /*
         * Update lgroup with specified lgroup platform handle to have
         * new latency
         */
        case LGRP_CONFIG_LAT_CHANGE:
                lgrp_latency_change((lgrp_handle_t)resource, 0,
                    (u_longlong_t)where);

                break;
        case LGRP_CONFIG_NOP:

                break;
        default:
                break;
        }

}
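
/*
 * For example, bringing a CPU into the system generates an
 * LGRP_CONFIG_CPU_ADD event followed by LGRP_CONFIG_CPU_ONLINE once the CPU
 * starts running (lgrp_setup() above does exactly this for the boot CPU),
 * while the memnode subsystem generates LGRP_CONFIG_MEM_ADD as memory is
 * configured into the system.
 */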

/*
 * Called to add lgrp info into cpu structure from cpu_add_unit;
 * do not assume cpu is in cpu[] yet!
 *
 * CPUs are brought online with all other CPUs paused so we can't
 * allocate memory or we could deadlock the system, so we rely on
 * the platform to statically allocate as much space as we need
 * for the lgrp structs and stats.
 */
static void
lgrp_cpu_init(struct cpu *cp)
{
        klgrpset_t      changed;
        int             count;
        lgrp_handle_t   hand;
        int             first_cpu;
        lgrp_t          *my_lgrp;
        lgrp_id_t       lgrpid;
        struct cpu      *cptr;

        /*
         * This is the first time through if the resource set
         * for the root lgroup is empty. After cpu0 has been
         * initially added to an lgroup, the root's CPU resource
         * set can never be empty, since the system's last CPU
         * cannot be offlined.
         */
        if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
                /*
                 * First time through.
                 */
                first_cpu = 1;
        } else {
                /*
                 * If cpu0 needs to move lgroups, we may come
                 * through here again, at which time cpu_lock won't
                 * be held, and lgrp_initialized will be false.
                 */
                ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
                ASSERT(cp->cpu_part != NULL);
                first_cpu = 0;
        }

        hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
        my_lgrp = lgrp_hand_to_lgrp(hand);

        if (my_lgrp == NULL) {
                /*
                 * Create new lgrp and add it to lgroup topology
                 */
                my_lgrp = lgrp_create();
                my_lgrp->lgrp_plathand = hand;
                my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
                lgrpid = my_lgrp->lgrp_id;
                klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
                klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);

                count = 0;
                klgrpset_clear(changed);
                count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
                    &changed);
                /*
                 * May have added new intermediate lgroups, so need to add
                 * resources other than CPUs which are added below
                 */
                (void) lgrp_mnode_update(changed, NULL);
        } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
            > 0) {
                /*
                 * Leaf lgroup was created, but latency wasn't available
                 * then.  So, set latency for it and fill in rest of lgroup
                 * topology  now that we know how far it is from other leaf
                 * lgroups.
                 */
                lgrpid = my_lgrp->lgrp_id;
                klgrpset_clear(changed);
                if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
                    lgrpid))
                        klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
                count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
                    &changed);

                /*
                 * May have added new intermediate lgroups, so need to add
                 * resources other than CPUs which are added below
                 */
                (void) lgrp_mnode_update(changed, NULL);
        } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
            my_lgrp->lgrp_id)) {
                int     i;

                /*
                 * Update existing lgroup and lgroups containing it with CPU
                 * resource
                 */
                lgrpid = my_lgrp->lgrp_id;
                klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
                for (i = 0; i <= lgrp_alloc_max; i++) {
                        lgrp_t          *lgrp;

                        lgrp = lgrp_table[i];
                        if (!LGRP_EXISTS(lgrp) ||
                            !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
                                continue;

                        klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
                }
        }

        lgrpid = my_lgrp->lgrp_id;
        cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];

        /*
         * For multi-lgroup systems, we need to set up the lpl for CPU0 here,
         * or CPU0 will end up in the lpl for lgroup 0 whether it is supposed
         * to be there or not, since none of the lgroup IDs in the lpls have
         * been set yet.
         */
        if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
                cp->cpu_lpl->lpl_lgrpid = lgrpid;

        /*
         * link the CPU into the lgrp's CPU list
         */
        if (my_lgrp->lgrp_cpucnt == 0) {
                my_lgrp->lgrp_cpu = cp;
                cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
        } else {
                cptr = my_lgrp->lgrp_cpu;
                cp->cpu_next_lgrp = cptr;
                cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
                cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
                cptr->cpu_prev_lgrp = cp;
        }
        my_lgrp->lgrp_cpucnt++;
}

lgrp_t *
lgrp_create(void)
{
        lgrp_t          *my_lgrp;
        lgrp_id_t       lgrpid;
        int             i;

        ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
        lgrpid = 0;

        /*
         * Find an open slot in the lgroup table and recycle any unused
         * lgroup left there
         */
        my_lgrp = NULL;
        if (lgrp_alloc_hint == -1)
                /*
                 * Allocate from end when hint not set yet because no lgroups
                 * have been deleted yet
                 */
                lgrpid = nlgrps++;
        else {
                /*
                 * Start looking for next open slot from hint and leave hint
                 * at slot allocated
                 */
                for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
                        my_lgrp = lgrp_table[i];
                        if (!LGRP_EXISTS(my_lgrp)) {
                                lgrpid = i;
                                nlgrps++;
                                break;
                        }
                }
                lgrp_alloc_hint = lgrpid;
        }

        /*
         * Keep track of max lgroup ID allocated so far to cut down on searches
         */
        if (lgrpid > lgrp_alloc_max)
                lgrp_alloc_max = lgrpid;

        /*
         * Need to allocate new lgroup if next open slot didn't have one
         * for recycling
         */
        if (my_lgrp == NULL)
                my_lgrp = lgrp_plat_alloc(lgrpid);

        if (nlgrps > nlgrpsmax || my_lgrp == NULL)
                panic("Too many lgrps for platform (%d)", nlgrps);

        my_lgrp->lgrp_id = lgrpid;
        my_lgrp->lgrp_latency = 0;
        my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
        my_lgrp->lgrp_parent = NULL;
        my_lgrp->lgrp_childcnt = 0;
        my_lgrp->lgrp_mnodes = (mnodeset_t)0;
        my_lgrp->lgrp_nmnodes = 0;
        klgrpset_clear(my_lgrp->lgrp_children);
        klgrpset_clear(my_lgrp->lgrp_leaves);
        for (i = 0; i < LGRP_RSRC_COUNT; i++)
                klgrpset_clear(my_lgrp->lgrp_set[i]);

        my_lgrp->lgrp_cpu = NULL;
        my_lgrp->lgrp_cpucnt = 0;

        if (my_lgrp->lgrp_kstat != NULL)
                lgrp_kstat_reset(lgrpid);

        lgrp_table[my_lgrp->lgrp_id] = my_lgrp;

        return (my_lgrp);
}

void
lgrp_destroy(lgrp_t *lgrp)
{
        int             i;

        /*
         * Unless this lgroup is being destroyed on behalf of
         * the boot CPU, cpu_lock must be held
         */
        ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

        if (nlgrps == 1)
                cmn_err(CE_PANIC, "Can't destroy only lgroup!");

        if (!LGRP_EXISTS(lgrp))
                return;

        /*
         * Set hint to lgroup being deleted and try to keep lower numbered
         * hints to facilitate finding empty slots
         */
        if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
                lgrp_alloc_hint = lgrp->lgrp_id;

        /*
         * Mark this lgroup to be recycled by setting its lgroup ID to
         * LGRP_NONE and clear relevant fields
         */
        lgrp->lgrp_id = LGRP_NONE;
        lgrp->lgrp_latency = 0;
        lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
        lgrp->lgrp_parent = NULL;
        lgrp->lgrp_childcnt = 0;

        klgrpset_clear(lgrp->lgrp_children);
        klgrpset_clear(lgrp->lgrp_leaves);
        for (i = 0; i < LGRP_RSRC_COUNT; i++)
                klgrpset_clear(lgrp->lgrp_set[i]);

        lgrp->lgrp_mnodes = (mnodeset_t)0;
        lgrp->lgrp_nmnodes = 0;

        lgrp->lgrp_cpu = NULL;
        lgrp->lgrp_cpucnt = 0;

        nlgrps--;
}

/*
 * Initialize kstat data. Called from lgrp initialization code.
 */
static void
lgrp_kstat_init(void)
{
        lgrp_stat_t     stat;

        mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);

        for (stat = 0; stat < LGRP_NUM_STATS; stat++)
                kstat_named_init(&lgrp_kstat_data[stat],
                    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
}

/*
 * Initialize an lgrp's kstats if needed.
 * Called with cpu_lock held but not with cpus paused.
 * We don't tear these down now because we don't know about
 * memory leaving the lgrp yet...
 */

void
lgrp_kstat_create(cpu_t *cp)
{
        kstat_t         *lgrp_kstat;
        lgrp_id_t       lgrpid;
        lgrp_t          *my_lgrp;

        ASSERT(MUTEX_HELD(&cpu_lock));

        lgrpid = cp->cpu_lpl->lpl_lgrpid;
        my_lgrp = lgrp_table[lgrpid];

        if (my_lgrp->lgrp_kstat != NULL)
                return; /* already initialized */

        lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
            KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
            KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);

        if (lgrp_kstat != NULL) {
                lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
                lgrp_kstat->ks_private = my_lgrp;
                lgrp_kstat->ks_data = &lgrp_kstat_data;
                lgrp_kstat->ks_update = lgrp_kstat_extract;
                my_lgrp->lgrp_kstat = lgrp_kstat;
                kstat_install(lgrp_kstat);
        }
}
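
/*
 * These kstats are created under module "lgrp" with the lgroup ID as the
 * kstat instance, so they can be examined from userland with, for example,
 * "kstat -m lgrp -i 1" or programmatically via libkstat.
 */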

/*
 * This will do something when we manage to remove now-unused lgrps.
 */

/* ARGSUSED */
void
lgrp_kstat_destroy(cpu_t *cp)
{
        ASSERT(MUTEX_HELD(&cpu_lock));
}

/*
 * Called when a CPU is off-lined.
 */
static void
lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
{
        lgrp_t *my_lgrp;
        struct cpu *prev;
        struct cpu *next;

        ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

        prev = cp->cpu_prev_lgrp;
        next = cp->cpu_next_lgrp;

        prev->cpu_next_lgrp = next;
        next->cpu_prev_lgrp = prev;

        /*
         * just because I'm paranoid doesn't mean...
         */

        cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;

        my_lgrp = lgrp_table[lgrpid];
        my_lgrp->lgrp_cpucnt--;

        /*
         * Removing last CPU in lgroup, so update lgroup topology
         */
        if (my_lgrp->lgrp_cpucnt == 0) {
                klgrpset_t      changed;
                int             count;
                int             i;

                my_lgrp->lgrp_cpu = NULL;

                /*
                 * Remove this lgroup from its lgroup CPU resources and remove
                 * lgroup from lgroup topology if it doesn't have any more
                 * resources in it now
                 */
                klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
                if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
                        count = 0;
                        klgrpset_clear(changed);
                        count += lgrp_leaf_delete(my_lgrp, lgrp_table,
                            lgrp_alloc_max + 1, &changed);
                        return;
                }

                /*
                 * This lgroup isn't empty, so just remove it from CPU
                 * resources of any lgroups that contain it as such
                 */
                for (i = 0; i <= lgrp_alloc_max; i++) {
                        lgrp_t          *lgrp;

                        lgrp = lgrp_table[i];
                        if (!LGRP_EXISTS(lgrp) ||
                            !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
                            lgrpid))
                                continue;

                        klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
                }
                return;
        }

        if (my_lgrp->lgrp_cpu == cp)
                my_lgrp->lgrp_cpu = next;

}

/*
 * Update memory nodes in target lgroups and return ones that get changed
 */
int
lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
{
        int     count;
        int     i;
        int     j;
        lgrp_t  *lgrp;
        lgrp_t  *lgrp_rsrc;

        count = 0;
        if (changed)
                klgrpset_clear(*changed);

        if (klgrpset_isempty(target))
                return (0);

        /*
         * Find each lgroup in target lgroups
         */
        for (i = 0; i <= lgrp_alloc_max; i++) {
                /*
                 * Skip any lgroups that don't exist or aren't in target group
                 */
                lgrp = lgrp_table[i];
                if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
                        continue;
                }

                /*
                 * Initialize memnodes for intermediate lgroups to 0
                 * and update them from scratch since they may have completely
                 * changed
                 */
                if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
                        lgrp->lgrp_mnodes = (mnodeset_t)0;
                        lgrp->lgrp_nmnodes = 0;
                }

                /*
                 * Update memory nodes of target lgroup with memory nodes
                 * from each lgroup in its lgroup memory resource set
                 */
                for (j = 0; j <= lgrp_alloc_max; j++) {
                        int     k;

                        /*
                         * Skip any lgroups that don't exist or aren't in
                         * memory resources of target lgroup
                         */
                        lgrp_rsrc = lgrp_table[j];
                        if (!LGRP_EXISTS(lgrp_rsrc) ||
                            !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
                            j))
                                continue;

                        /*
                         * Update target lgroup's memnodes to include memnodes
                         * of this lgroup
                         */
                        for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
                                mnodeset_t      mnode_mask;

                                mnode_mask = (mnodeset_t)1 << k;
                                if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
                                    !(lgrp->lgrp_mnodes & mnode_mask)) {
                                        lgrp->lgrp_mnodes |= mnode_mask;
                                        lgrp->lgrp_nmnodes++;
                                }
                        }
                        count++;
                        if (changed)
                                klgrpset_add(*changed, lgrp->lgrp_id);
                }
        }

        return (count);
}

/*
 * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
 * is moved from one board to another. The "from" and "to" arguments specify the
 * source and the destination of the move.
 *
 * See plat_lgrp_config() for a detailed description of the copy-rename
 * semantics.
 *
 * lgrp_mem_rename() is called by the platform copy-rename code to update the
 * lgroup topology, which changes as memory moves from one lgroup to another.
 * It removes the mnode from the source lgroup and re-inserts it in the
 * target lgroup.
 *
 * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
 * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
 * copy-rename operation.
 *
 * There is one case which requires special handling. If the system contains
 * only two boards (mnodes), lgrp_mem_fini() removes the only mnode from the
 * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
 * lgrp_mem_init(), but there is a window when the system has no memory in the
 * lgroup hierarchy. If another thread tries to allocate memory during this
 * window, the allocation will fail, although the system has physical memory.
 * This may cause a system panic or a deadlock (some sleeping memory allocations
 * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
 * the mnode back).
 *
 * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
 * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
 * lgrp_mem_fini() does not remove the last mnode from lroot->lgrp_mnodes,
 * but it updates the rest of the lgroup topology as if the mnode was actually
 * removed. The lgrp_mem_init() function recognizes that the mnode being
 * inserted represents such a special case and updates the topology
 * appropriately.
 */
void
lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
{
        /*
         * Remove the memory from the source node and add it to the destination
         * node.
         */
        lgrp_mem_fini(mnode, from, B_TRUE);
        lgrp_mem_init(mnode, to, B_TRUE);
}

/*
 * Called to indicate that the lgrp with platform handle "hand" now
 * contains the memory identified by "mnode".
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the insertion is part of the DR copy-rename and the inserted mnode (and
 * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
 * dealing with the special case of DR copy-rename described in
 * lgrp_mem_rename().
 */
void
lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
        klgrpset_t      changed;
        int             count;
        int             i;
        lgrp_t          *my_lgrp;
        lgrp_id_t       lgrpid;
        mnodeset_t      mnodes_mask = ((mnodeset_t)1 << mnode);
        boolean_t       drop_lock = B_FALSE;
        boolean_t       need_synch = B_FALSE;

        /*
         * Grab CPU lock (if we haven't already)
         */
        if (!MUTEX_HELD(&cpu_lock)) {
                mutex_enter(&cpu_lock);
                drop_lock = B_TRUE;
        }

        /*
         * This routine may be called from a context where we already
         * hold cpu_lock, and have already paused cpus.
         */
        if (!cpus_paused())
                need_synch = B_TRUE;

        /*
         * Check if this mnode is already configured and return immediately if
         * it is.
         *
         * NOTE: in special case of copy-rename of the only remaining mnode,
         * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
         * recognize this case and continue as usual, but skip the update to
         * the lgrp_mnodes and the lgrp_nmnodes. This resolves the
         * inconsistency in the topology temporarily introduced by
         * lgrp_mem_fini().
         */
        if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
            lgrp_root->lgrp_mnodes & mnodes_mask) {
                if (drop_lock)
                        mutex_exit(&cpu_lock);
                return;
        }

        /*
         * Update lgroup topology with new memory resources, keeping track of
         * which lgroups change
         */
        count = 0;
        klgrpset_clear(changed);
        my_lgrp = lgrp_hand_to_lgrp(hand);
        if (my_lgrp == NULL) {
                /* new lgrp */
                my_lgrp = lgrp_create();
                lgrpid = my_lgrp->lgrp_id;
                my_lgrp->lgrp_plathand = hand;
                my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
                klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
                klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);

                if (need_synch)
                        pause_cpus(NULL, NULL);
                count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
                    &changed);
                if (need_synch)
                        start_cpus();
        } else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
            > 0) {
                /*
                 * Leaf lgroup was created, but latency wasn't available
                 * then.  So, set latency for it and fill in rest of lgroup
                 * topology  now that we know how far it is from other leaf
                 * lgroups.
                 */
                klgrpset_clear(changed);
                lgrpid = my_lgrp->lgrp_id;
                if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
                    lgrpid))
                        klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
                if (need_synch)
                        pause_cpus(NULL, NULL);
                count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
                    &changed);
                if (need_synch)
                        start_cpus();
        } else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
            my_lgrp->lgrp_id)) {
                /*
                 * Add new lgroup memory resource to existing lgroup
                 */
                lgrpid = my_lgrp->lgrp_id;
                klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
                klgrpset_add(changed, lgrpid);
                count++;
                for (i = 0; i <= lgrp_alloc_max; i++) {
                        lgrp_t          *lgrp;

                        lgrp = lgrp_table[i];
                        if (!LGRP_EXISTS(lgrp) ||
                            !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
                                continue;

                        klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
                        klgrpset_add(changed, lgrp->lgrp_id);
                        count++;
                }
        } else {
                if (drop_lock)
                        mutex_exit(&cpu_lock);
                return;
        }

        /*
         * Add memory node to lgroup and remove lgroup from ones that need
         * to be updated
         */
        if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
                my_lgrp->lgrp_mnodes |= mnodes_mask;
                my_lgrp->lgrp_nmnodes++;
        }
        klgrpset_del(changed, lgrpid);

        /*
         * Update memory node information for all lgroups that changed and
         * contain new memory node as a resource
         */
        if (count)
                (void) lgrp_mnode_update(changed, NULL);

        if (drop_lock)
                mutex_exit(&cpu_lock);
}

/*
 * Called to indicate that the lgroup associated with the platform
 * handle "hand" no longer contains given memory node
 *
 * LOCKING for this routine is a bit tricky. Usually it is called without
 * cpu_lock and it must grab cpu_lock here to prevent racing with other
 * callers. During DR of the board containing the caged memory it may be called
 * with cpu_lock already held and CPUs paused.
 *
 * If the deletion is part of the DR copy-rename and the deleted mnode is the
 * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
 * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
 * the same mnode back into the topology. See lgrp_mem_rename() and
 * lgrp_mem_init() for additional details.
 */
void
lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
{
        klgrpset_t      changed;
        int             count;
        int             i;
        lgrp_t          *my_lgrp;
        lgrp_id_t       lgrpid;
        mnodeset_t      mnodes_mask;
        boolean_t       drop_lock = B_FALSE;
        boolean_t       need_synch = B_FALSE;

        /*
         * Grab CPU lock (if we haven't already)
         */
        if (!MUTEX_HELD(&cpu_lock)) {
                mutex_enter(&cpu_lock);
                drop_lock = B_TRUE;
        }

        /*
         * This routine may be called from a context where we already
         * hold cpu_lock and have already paused cpus.
         */
        if (!cpus_paused())
                need_synch = B_TRUE;

        my_lgrp = lgrp_hand_to_lgrp(hand);

        /*
         * The lgrp *must* be pre-existing
         */
        ASSERT(my_lgrp != NULL);

        /*
         * Delete memory node from lgroups which contain it
         */
        mnodes_mask = ((mnodeset_t)1 << mnode);
        for (i = 0; i <= lgrp_alloc_max; i++) {
                lgrp_t *lgrp = lgrp_table[i];
                /*
                 * Skip any non-existent lgroups and any lgroups that don't
                 * contain leaf lgroup of memory as a memory resource
                 */
                if (!LGRP_EXISTS(lgrp) ||
                    !(lgrp->lgrp_mnodes & mnodes_mask))
                        continue;

                /*
                 * Avoid removing the last mnode from the root in the DR
                 * copy-rename case. See lgrp_mem_rename() for details.
                 */
                if (is_copy_rename &&
                    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
                        continue;

                /*
                 * Remove memory node from lgroup.
                 */
                lgrp->lgrp_mnodes &= ~mnodes_mask;
                ASSERT(lgrp->lgrp_nmnodes > 0);
                lgrp->lgrp_nmnodes--;
        }
        ASSERT(lgrp_root->lgrp_nmnodes > 0);

        /*
         * Don't need to update lgroup topology if this lgroup still has memory.
         *
         * In the special case of DR copy-rename with the only mnode being
         * removed, the lgrp_mnodes for the root is always non-zero, but we
         * still need to update the lgroup topology.
         */
        if ((my_lgrp->lgrp_nmnodes > 0) &&
            !(is_copy_rename && (my_lgrp == lgrp_root) &&
            (my_lgrp->lgrp_mnodes == mnodes_mask))) {
                if (drop_lock)
                        mutex_exit(&cpu_lock);
                return;
        }

        /*
         * This lgroup does not contain any memory now
         */
        klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);

        /*
         * Remove this lgroup from lgroup topology if it does not contain any
         * resources now
         */
        lgrpid = my_lgrp->lgrp_id;
        count = 0;
        klgrpset_clear(changed);
        if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
                /*
                 * Delete lgroup when no more resources
                 */
                if (need_synch)
                        pause_cpus(NULL, NULL);
                count = lgrp_leaf_delete(my_lgrp, lgrp_table,
                    lgrp_alloc_max + 1, &changed);
                ASSERT(count > 0);
                if (need_synch)
                        start_cpus();
        } else {
                /*
                 * Remove lgroup from memory resources of any lgroups that
                 * contain it as such
                 */
                for (i = 0; i <= lgrp_alloc_max; i++) {
                        lgrp_t          *lgrp;

                        lgrp = lgrp_table[i];
                        if (!LGRP_EXISTS(lgrp) ||
                            !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
                            lgrpid))
                                continue;

                        klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
                }
        }
        if (drop_lock)
                mutex_exit(&cpu_lock);
}

/*
 * Return lgroup with given platform handle
 */
lgrp_t *
lgrp_hand_to_lgrp(lgrp_handle_t hand)
{
        int     i;
        lgrp_t  *lgrp;

        if (hand == LGRP_NULL_HANDLE)
                return (NULL);

        for (i = 0; i <= lgrp_alloc_max; i++) {
                lgrp = lgrp_table[i];
                if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
                        return (lgrp);
        }
        return (NULL);
}

/*
 * Return the home lgroup of the current thread.
 * We must do this with kernel preemption disabled, since we don't want our
 * thread to be re-homed while we're poking around with its lpl, and the lpl
 * should never be NULL.
 *
 * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
 * is enabled because of DR.  Callers can disable kernel preemption
 * around this call to guarantee that the lgroup will be valid beyond this
 * routine, since kernel preemption can be recursive.
 */
lgrp_t *
lgrp_home_lgrp(void)
{
        lgrp_t  *lgrp;
        lpl_t   *lpl;

        kpreempt_disable();

        lpl = curthread->t_lpl;
        ASSERT(lpl != NULL);
        ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
        ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
        lgrp = lgrp_table[lpl->lpl_lgrpid];

        kpreempt_enable();

        return (lgrp);
}
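
/*
 * An illustrative sketch of the pattern described above for callers that
 * need the lgroup to stay valid after this routine returns:
 *
 *      kpreempt_disable();
 *      lgrp = lgrp_home_lgrp();
 *      ... use lgrp while preemption is still disabled ...
 *      kpreempt_enable();
 *
 * This is safe because kpreempt_disable()/kpreempt_enable() nest, so the
 * disable/enable pair inside lgrp_home_lgrp() does not re-enable preemption
 * early.
 */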

/*
 * Return ID of home lgroup for given thread
 * (See comments for lgrp_home_lgrp() for special care and handling
 * instructions)
 */
lgrp_id_t
lgrp_home_id(kthread_t *t)
{
        lgrp_id_t       lgrp;
        lpl_t           *lpl;

        ASSERT(t != NULL);
        /*
         * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
         * cannot since the HAT layer can call into this routine to
         * determine the locality for its data structures in the context
         * of a page fault.
         */

        kpreempt_disable();

        lpl = t->t_lpl;
        ASSERT(lpl != NULL);
        ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
        lgrp = lpl->lpl_lgrpid;

        kpreempt_enable();

        return (lgrp);
}

/*
 * Return lgroup containing the physical memory for the given page frame number
 */
lgrp_t *
lgrp_pfn_to_lgrp(pfn_t pfn)
{
        lgrp_handle_t   hand;
        int             i;
        lgrp_t          *lgrp;

        hand = lgrp_plat_pfn_to_hand(pfn);
        if (hand != LGRP_NULL_HANDLE)
                for (i = 0; i <= lgrp_alloc_max; i++) {
                        lgrp = lgrp_table[i];
                        if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
                                return (lgrp);
                }
        return (NULL);
}

/*
 * Return lgroup containing the physical memory for the given page frame number
 */
lgrp_t *
lgrp_phys_to_lgrp(u_longlong_t physaddr)
{
        lgrp_handle_t   hand;
        int             i;
        lgrp_t          *lgrp;
        pfn_t           pfn;

        pfn = btop(physaddr);
        hand = lgrp_plat_pfn_to_hand(pfn);
        if (hand != LGRP_NULL_HANDLE)
                for (i = 0; i <= lgrp_alloc_max; i++) {
                        lgrp = lgrp_table[i];
                        if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
                                return (lgrp);
                }
        return (NULL);
}

/*
 * Return the leaf lgroup containing the given CPU
 *
 * The caller needs to take the precautions necessary to prevent
 * "cpu" and its lpl from going away across a call to this function.
 * hint: kpreempt_disable()/kpreempt_enable()
 */
static lgrp_t *
lgrp_cpu_to_lgrp(cpu_t *cpu)
{
        return (cpu->cpu_lpl->lpl_lgrp);
}

/*
 * Return the sum of the partition loads in an lgrp divided by
 * the number of CPUs in the lgrp.  This is our best approximation
 * of an 'lgroup load average' for a useful per-lgroup kstat.
 */
static uint64_t
lgrp_sum_loadavgs(lgrp_t *lgrp)
{
        cpu_t *cpu;
        int ncpu;
        uint64_t loads = 0;

        mutex_enter(&cpu_lock);

        cpu = lgrp->lgrp_cpu;
        ncpu = lgrp->lgrp_cpucnt;

        if (cpu == NULL || ncpu == 0) {
                mutex_exit(&cpu_lock);
                return (0ull);
        }

        do {
                loads += cpu->cpu_lpl->lpl_loadavg;
                cpu = cpu->cpu_next_lgrp;
        } while (cpu != lgrp->lgrp_cpu);

        mutex_exit(&cpu_lock);

        return (loads / ncpu);
}

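/*
 * Atomically add "val" to counter statistic "stat" for the lgroup identified
 * by "lgrpid".  Out-of-range lgroup IDs are silently ignored.
 */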
void
lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
{
        struct lgrp_stats *pstats;

        /*
         * Verify that the caller isn't trying to add to
         * a statistic for an lgroup that has gone away
         */
        if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
                return;

        pstats = &lgrp_stats[lgrpid];
        atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
}

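/*
 * Return the current value of counter statistic "stat" for the lgroup
 * identified by "lgrpid", or 0 if the lgroup ID is out of range.
 */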
int64_t
lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
{
        uint64_t val;
        struct lgrp_stats *pstats;

        if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
                return ((int64_t)0);

        pstats = &lgrp_stats[lgrpid];
        LGRP_STAT_READ(pstats, stat, val);
        return (val);
}

/*
 * Reset all kstats for lgrp specified by its lgrpid.
 */
static void
lgrp_kstat_reset(lgrp_id_t lgrpid)
{
        lgrp_stat_t stat;

        if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
                return;

        for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
                LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
        }
}

/*
 * Collect all per-lgrp statistics for the lgrp associated with this
 * kstat, and store them in the ks_data array.
 *
 * The superuser can reset all the running counter statistics for an
 * lgrp by writing to any of the lgrp's stats.
 */
static int
lgrp_kstat_extract(kstat_t *ksp, int rw)
{
        lgrp_stat_t             stat;
        struct kstat_named      *ksd;
        lgrp_t                  *lgrp;
        lgrp_id_t               lgrpid;

        lgrp = (lgrp_t *)ksp->ks_private;

        ksd = (struct kstat_named *)ksp->ks_data;
        ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);

        lgrpid = lgrp->lgrp_id;

        if (lgrpid == LGRP_NONE) {
                /*
                 * Return all zeroes as stats for freed lgrp.
                 */
                for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
                        ksd[stat].value.i64 = 0;
                }
                ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
                ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
                ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
                ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
                ksd[stat + LGRP_LOADAVG].value.i64 = 0;
        } else if (rw != KSTAT_WRITE) {
                /*
                 * Handle counter stats
                 */
                for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
                        ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
                }

                /*
                 * Handle kernel data snapshot stats
                 */
                ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
                ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
                    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
                ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
                    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
                ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
                    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
                ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
                ksd[stat + LGRP_LOADAVG_SCALE].value.i64 =
                    lgrp_loadavg_max_effect;
        } else {
                lgrp_kstat_reset(lgrpid);
        }

        return (0);
}

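/*
 * Look up the home lgroup ID of the CPU with the given processor ID and
 * store it in *lp.  Returns EINVAL if the CPU doesn't exist, is offline,
 * or is powered off; returns 0 otherwise.
 */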
int
lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
{
        cpu_t   *cp;

        mutex_enter(&cpu_lock);

        if ((cp = cpu_get(id)) == NULL) {
                mutex_exit(&cpu_lock);
                return (EINVAL);
        }

        if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
                mutex_exit(&cpu_lock);
                return (EINVAL);
        }

        ASSERT(cp->cpu_lpl != NULL);

        *lp = cp->cpu_lpl->lpl_lgrpid;

        mutex_exit(&cpu_lock);

        return (0);
}

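/*
 * Look up the load average of the lpl that the CPU with the given processor
 * ID belongs to and store it in *lp.  Returns EINVAL if the CPU doesn't
 * exist, 0 otherwise.
 */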
int
lgrp_query_load(processorid_t id, lgrp_load_t *lp)
{
        cpu_t *cp;

        mutex_enter(&cpu_lock);

        if ((cp = cpu_get(id)) == NULL) {
                mutex_exit(&cpu_lock);
                return (EINVAL);
        }

        ASSERT(cp->cpu_lpl != NULL);

        *lp = cp->cpu_lpl->lpl_loadavg;

        mutex_exit(&cpu_lock);

        return (0);
}

/*
 * Add a resource named by lpl_leaf to rset of lpl_target
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in adding a
 * resource. It is adjusted here, as this is presently the only place that we
 * can be certain a resource addition has succeeded.
 *
 * We keep the list of rsets sorted so that the dispatcher can quickly walk the
 * list in order until it reaches a NULL.  (This list is required to be NULL
 * terminated, too).  This is done so that we can mark start pos + 1, so that
 * each lpl is traversed sequentially, but in a different order.  We hope this
 * will improve performance a bit.  (Hopefully, less read-to-own traffic...)
 */

void
lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
        int             i;
        int             entry_slot = 0;

        /* return if leaf is already present */
        for (i = 0; i < lpl_target->lpl_nrset; i++) {
                if (lpl_target->lpl_rset[i] == lpl_leaf) {
                        return;
                }

                if (lpl_target->lpl_rset[i]->lpl_lgrpid >
                    lpl_leaf->lpl_lgrpid) {
                        break;
                }
        }

        /* insert leaf, update counts */
        entry_slot = i;
        i = lpl_target->lpl_nrset++;

        /*
         * Start at the end of the rset array and work backwards towards the
         * slot into which the new lpl will be inserted. This effectively
         * preserves the current ordering by scooting everybody over one entry,
         * and placing the new entry into the space created.
         */
        while (i-- > entry_slot) {
                lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
                lpl_target->lpl_id2rset[lpl_target->lpl_rset[i]->lpl_lgrpid] =
                    i + 1;
        }

        lpl_target->lpl_rset[entry_slot] = lpl_leaf;
        lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = entry_slot;

        lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
}

/*
 * Update each of lpl_parent's children with a reference to their parent.
 * The lgrp topology is used as the reference since it is fully
 * consistent and correct at this point.
 * This should be called after any potential change in lpl_parent's
 * rset.
 */
static void
lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
{
        klgrpset_t      children;
        int             i;

        children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
        if (klgrpset_isempty(children))
                return; /* nothing to do */

        for (i = 0; i <= lgrp_alloc_max; i++) {
                if (klgrpset_ismember(children, i)) {
                        /*
                         * (Re)set the parent. It may be incorrect if
                         * lpl_parent is new in the topology.
                         */
                        cp->cp_lgrploads[i].lpl_parent = lpl_parent;
                }
        }
}

/*
 * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
 *
 * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
 * resource. The values are adjusted here, as this is the only place that we can
 * be certain a resource was successfully deleted.
 */
void
lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
        int i;
        lpl_t *leaf;

        if (lpl_target->lpl_nrset == 0)
                return;

        /* find leaf in intermediate node */
        for (i = 0; i < lpl_target->lpl_nrset; i++) {
                if (lpl_target->lpl_rset[i] == lpl_leaf)
                        break;
        }

        /* return if leaf not found */
        if (lpl_target->lpl_rset[i] != lpl_leaf)
                return;

        /* prune leaf, compress array */
        lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
        lpl_target->lpl_id2rset[lpl_leaf->lpl_lgrpid] = -1;
        lpl_target->lpl_ncpu--;
        do {
                lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
                /*
                 * Update the lgrp id <=> rset mapping
                 */
                if ((leaf = lpl_target->lpl_rset[i]) != NULL) {
                        lpl_target->lpl_id2rset[leaf->lpl_lgrpid] = i;
                }
        } while (i++ < lpl_target->lpl_nrset);
}

/*
 * Check to see if the resource set of the target lpl contains the
 * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
 */

int
lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
{
        int i;

        for (i = 0; i < lpl_target->lpl_nrset; i++) {
                if (lpl_target->lpl_rset[i] == lpl_leaf)
                        return (1);
        }

        return (0);
}

/*
 * Called when we change cpu lpl membership.  This increments or decrements the
 * per-cpu counter in every lpl in which our leaf appears.
 */
void
lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
{
        cpupart_t       *cpupart;
        lgrp_t          *lgrp_leaf;
        lgrp_t          *lgrp_cur;
        lpl_t           *lpl_leaf;
        lpl_t           *lpl_cur;
        int             i;

        ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);

        cpupart = cp->cpu_part;
        lpl_leaf = cp->cpu_lpl;
        lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];

        for (i = 0; i <= lgrp_alloc_max; i++) {
                lgrp_cur = lgrp_table[i];

                /*
                 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
                 * for the cpu in question, or if the current lgrp and leaf
                 * don't share the same resources.
                 */

                if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
                    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
                    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
                        continue;


                lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

                if (lpl_cur->lpl_nrset > 0) {
                        if (act == LPL_INCREMENT) {
                                lpl_cur->lpl_ncpu++;
                        } else if (act == LPL_DECREMENT) {
                                lpl_cur->lpl_ncpu--;
                        }
                }
        }
}

/*
 * Initialize lpl with given resources and specified lgrp
 */
void
lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
{
        lpl->lpl_lgrpid = lgrp->lgrp_id;
        lpl->lpl_loadavg = 0;
        if (lpl == lpl_leaf)
                lpl->lpl_ncpu = 1;
        else
                lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
        lpl->lpl_nrset = 1;
        lpl->lpl_rset[0] = lpl_leaf;
        lpl->lpl_id2rset[lpl_leaf->lpl_lgrpid] = 0;
        lpl->lpl_lgrp = lgrp;
        lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
        lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
}

/*
 * Clear an unused lpl
 */
void
lpl_clear(lpl_t *lpl)
{
        /*
         * Clear out all fields in the lpl except:
         *    lpl_lgrpid - to facilitate debugging
         *    lpl_rset, lpl_rset_sz, lpl_id2rset - rset array references / size
         *
         * Note that while the rset and id2rset pointers and the rset size are
         * preserved, the arrays they reference are zeroed.
         */
        lpl->lpl_loadavg = 0;
        lpl->lpl_ncpu = 0;
        lpl->lpl_lgrp = NULL;
        lpl->lpl_parent = NULL;
        lpl->lpl_cpus = NULL;
        lpl->lpl_nrset = 0;
        lpl->lpl_homed_time = 0;
        bzero(lpl->lpl_rset, sizeof (lpl->lpl_rset[0]) * lpl->lpl_rset_sz);
        bzero(lpl->lpl_id2rset,
            sizeof (lpl->lpl_id2rset[0]) * lpl->lpl_rset_sz);
}

/*
 * Given a CPU-partition, verify that the lpl topology in the CPU-partition
 * is in sync with the lgroup topology in the system.  The lpl topology may not
 * make full use of all of the lgroup topology, but this checks to make sure
 * that for the parts that it does use, it has correctly understood the
 * relationships that exist.  This function returns 0 if the topology is
 * correct, and a non-zero error code if it is incorrect.  Asserts are spread
 * throughout the code to aid in debugging on a DEBUG kernel.
 */
int
lpl_topo_verify(cpupart_t *cpupart)
{
        lgrp_t          *lgrp;
        lpl_t           *lpl;
        klgrpset_t      rset;
        klgrpset_t      cset;
        cpu_t           *cpu;
        cpu_t           *cp_start;
        int             i;
        int             j;
        int             sum;

        /* topology can't be incorrect if it doesn't exist */
        if (!lgrp_topo_initialized || !lgrp_initialized)
                return (LPL_TOPO_CORRECT);

        ASSERT(cpupart != NULL);

        for (i = 0; i <= lgrp_alloc_max; i++) {
                lgrp = lgrp_table[i];
                lpl = NULL;
                /* make sure lpls are allocated */
                ASSERT(cpupart->cp_lgrploads);
                if (!cpupart->cp_lgrploads)
                        return (LPL_TOPO_PART_HAS_NO_LPL);

                lpl = &cpupart->cp_lgrploads[i];
                /* make sure our index is good */
                ASSERT(i < cpupart->cp_nlgrploads);

                /* if lgroup doesn't exist, make sure lpl is empty */
                if (!LGRP_EXISTS(lgrp)) {
                        ASSERT(lpl->lpl_ncpu == 0);
                        if (lpl->lpl_ncpu > 0) {
                                return (LPL_TOPO_CPUS_NOT_EMPTY);
                        } else {
                                continue;
                        }
                }

                /* verify that lgroup and lpl are identically numbered */
                ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);

                /* if lgroup isn't in our partition, make sure lpl is empty */
                if (!klgrpset_intersects(lgrp->lgrp_leaves,
                    cpupart->cp_lgrpset)) {
                        ASSERT(lpl->lpl_ncpu == 0);
                        if (lpl->lpl_ncpu > 0) {
                                return (LPL_TOPO_CPUS_NOT_EMPTY);
                        }
                        /*
                         * lpl is empty, and lgroup isn't in partition.  verify
                         * that lpl doesn't show up in anyone else's rsets (in
                         * this partition, anyway)
                         */
                        for (j = 0; j < cpupart->cp_nlgrploads; j++) {
                                lpl_t *i_lpl; /* lpl we're iterating over */

                                i_lpl = &cpupart->cp_lgrploads[j];

                                ASSERT(!lpl_rset_contains(i_lpl, lpl));
                                if (lpl_rset_contains(i_lpl, lpl)) {
                                        return (LPL_TOPO_LPL_ORPHANED);
                                }
                        }
                        /* lpl is empty, and everything is ok. continue */
                        continue;
                }


                /* lgroup is in this partition, now check it against lpl */

                /* do both have matching lgrps? */
                ASSERT(lgrp == lpl->lpl_lgrp);
                if (lgrp != lpl->lpl_lgrp) {
                        return (LPL_TOPO_LGRP_MISMATCH);
                }

                /* do the parent lgroups exist and do they match? */
                if (lgrp->lgrp_parent) {
                        ASSERT(lpl->lpl_parent != NULL &&
                            lgrp->lgrp_parent->lgrp_id ==
                            lpl->lpl_parent->lpl_lgrpid);

                        if (!lpl->lpl_parent) {
                                return (LPL_TOPO_MISSING_PARENT);
                        } else if (lgrp->lgrp_parent->lgrp_id !=
                            lpl->lpl_parent->lpl_lgrpid) {
                                return (LPL_TOPO_PARENT_MISMATCH);
                        }
                }

                /* only leaf lgroups keep a cpucnt, only check leaves */
                if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {

                        /* verify that lgrp is also a leaf */
                        ASSERT((lgrp->lgrp_childcnt == 0) &&
                            (klgrpset_ismember(lgrp->lgrp_leaves,
                            lpl->lpl_lgrpid)));

                        if ((lgrp->lgrp_childcnt > 0) ||
                            (!klgrpset_ismember(lgrp->lgrp_leaves,
                            lpl->lpl_lgrpid))) {
                                return (LPL_TOPO_LGRP_NOT_LEAF);
                        }

                        ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
                            (lpl->lpl_ncpu > 0));
                        if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
                            (lpl->lpl_ncpu <= 0)) {
                                return (LPL_TOPO_BAD_CPUCNT);
                        }

                        /*
                         * Check that lpl_ncpu also matches the number of
                         * cpus in the lpl's linked list.  This only exists in
                         * leaves, but they should always match.
                         */
                        j = 0;
                        cpu = cp_start = lpl->lpl_cpus;
                        while (cpu != NULL) {
                                j++;

                                /* check to make sure cpu's lpl is leaf lpl */
                                ASSERT(cpu->cpu_lpl == lpl);
                                if (cpu->cpu_lpl != lpl) {
                                        return (LPL_TOPO_CPU_HAS_BAD_LPL);
                                }

                                /* check next cpu */
                                if ((cpu = cpu->cpu_next_lpl) != cp_start) {
                                        continue;
                                } else {
                                        cpu = NULL;
                                }
                        }

                        ASSERT(j == lpl->lpl_ncpu);
                        if (j != lpl->lpl_ncpu) {
                                return (LPL_TOPO_LPL_BAD_NCPU);
                        }

                        /*
                         * Also, check that leaf lpl is contained in all
                         * intermediate lpls that name the leaf as a descendant
                         */
                        for (j = 0; j <= lgrp_alloc_max; j++) {
                                klgrpset_t intersect;
                                lgrp_t *lgrp_cand;
                                lpl_t *lpl_cand;

                                lgrp_cand = lgrp_table[j];
                                intersect = klgrpset_intersects(
                                    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
                                    cpupart->cp_lgrpset);

                                if (!LGRP_EXISTS(lgrp_cand) ||
                                    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
                                    cpupart->cp_lgrpset) ||
                                    (intersect == 0))
                                        continue;

                                lpl_cand =
                                    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];

                                if (klgrpset_ismember(intersect,
                                    lgrp->lgrp_id)) {
                                        ASSERT(lpl_rset_contains(lpl_cand,
                                            lpl));

                                        if (!lpl_rset_contains(lpl_cand, lpl)) {
                                                return (LPL_TOPO_RSET_MSSNG_LF);
                                        }
                                }
                        }

                } else { /* non-leaf specific checks */

                        /*
                         * Non-leaf lpls should have lpl_cpus == NULL;
                         * verify that this is so.
                         */
                        ASSERT(lpl->lpl_cpus == NULL);
                        if (lpl->lpl_cpus != NULL) {
                                return (LPL_TOPO_NONLEAF_HAS_CPUS);
                        }

                        /*
                         * verify that the sum of the cpus in the leaf resources
                         * is equal to the total ncpu in the intermediate
                         */
                        for (j = sum = 0; j < lpl->lpl_nrset; j++) {
                                sum += lpl->lpl_rset[j]->lpl_ncpu;
                        }

                        ASSERT(sum == lpl->lpl_ncpu);
                        if (sum != lpl->lpl_ncpu) {
                                return (LPL_TOPO_LPL_BAD_NCPU);
                        }
                }

                /*
                 * Check the rset of the lpl in question.  Make sure that each
                 * rset contains a subset of the resources in
                 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
                 * sure that each rset doesn't include resources that are
                 * outside of that set.  (Which would be resources somehow not
                 * accounted for).
                 */
                klgrpset_clear(rset);
                for (j = 0; j < lpl->lpl_nrset; j++) {
                        klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
                }
                klgrpset_copy(cset, rset);
                /* make sure lpl rset matches lgrp rset */
                klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
                /* make sure rset is contained within the partition, too */
                klgrpset_diff(cset, cpupart->cp_lgrpset);

                ASSERT(klgrpset_isempty(rset) && klgrpset_isempty(cset));
                if (!klgrpset_isempty(rset) || !klgrpset_isempty(cset)) {
                        return (LPL_TOPO_RSET_MISMATCH);
                }

                /*
                 * check to make sure lpl_nrset matches the number of rsets
                 * contained in the lpl
                 */
                for (j = 0; j < lpl->lpl_nrset; j++) {
                        if (lpl->lpl_rset[j] == NULL)
                                break;
                }

                ASSERT(j == lpl->lpl_nrset);
                if (j != lpl->lpl_nrset) {
                        return (LPL_TOPO_BAD_RSETCNT);
                }

        }
        return (LPL_TOPO_CORRECT);
}

/*
 * Flatten lpl topology to given number of levels.  This is presently only
 * implemented for flattening to 2 levels, which will prune out the intermediates
 * and home the leaf lpls to the root lpl.
 */
int
lpl_topo_flatten(int levels)
{
        int             i;
        uint_t          sum;
        lgrp_t          *lgrp_cur;
        lpl_t           *lpl_cur;
        lpl_t           *lpl_root;
        cpupart_t       *cp;

        if (levels != 2)
                return (0);

        /* called w/ cpus paused - grab no locks! */
        ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
            !lgrp_initialized);

        cp = cp_list_head;
        do {
                lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
                ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));

                for (i = 0; i <= lgrp_alloc_max; i++) {
                        lgrp_cur = lgrp_table[i];
                        lpl_cur = &cp->cp_lgrploads[i];

                        if ((lgrp_cur == lgrp_root) ||
                            (!LGRP_EXISTS(lgrp_cur) &&
                            (lpl_cur->lpl_ncpu == 0)))
                                continue;

                        if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
                                /*
                                 * this should be a deleted intermediate, so
                                 * clear it
                                 */
                                lpl_clear(lpl_cur);
                        } else if ((lpl_cur->lpl_nrset == 1) &&
                            (lpl_cur->lpl_rset[0] == lpl_cur) &&
                            ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
                            (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
                                /*
                                 * this is a leaf whose parent was deleted, or
                                 * whose parent had its lgrp deleted.  (And
                                 * whose parent will soon be deleted).  Point
                                 * this guy back to the root lpl.
                                 */
                                lpl_cur->lpl_parent = lpl_root;
                                lpl_rset_add(lpl_root, lpl_cur);
                        }

                }

                /*
                 * Now that we're done, make sure the count on the root lpl is
                 * correct, and update the hints of the children for the sake of
                 * thoroughness
                 */
                for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
                        sum += lpl_root->lpl_rset[i]->lpl_ncpu;
                }
                lpl_root->lpl_ncpu = sum;
                lpl_child_update(lpl_root, cp);

                cp = cp->cp_next;
        } while (cp != cp_list_head);

        return (levels);
}

/*
 * Insert a lpl into the resource hierarchy and create any additional lpls that
 * are necessary to represent the varying states of locality for the cpu
 * resources newly added to the partition.
 *
 * This routine is clever enough that it can correctly add resources from the
 * new leaf into both direct and indirect resource sets in the hierarchy.  (Ie,
 * those for which the lpl is a leaf as opposed to simply a named equally local
 * resource).  The one special case that needs additional processing is when a
 * new intermediate lpl is introduced.  Since the main loop only traverses
 * looking to add the leaf resource where it does not yet exist, additional work
 * is necessary to add other leaf resources that may need to exist in the newly
 * created intermediate.  This is performed by the second inner loop, and is
 * only done when the check for more than one overlapping resource succeeds.
 */

void
lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
        int             i;
        int             j;
        int             rset_num_intersect;
        lgrp_t          *lgrp_cur;
        lpl_t           *lpl_cur;
        lpl_t           *lpl_parent;
        lgrp_id_t       parent_id;
        klgrpset_t      rset_intersect; /* resources in cpupart and lgrp */

        for (i = 0; i <= lgrp_alloc_max; i++) {
                lgrp_cur = lgrp_table[i];

                /*
                 * Don't insert if the lgrp isn't there, if the leaf isn't
                 * contained within the current lgrp, or if the current lgrp has
                 * no leaves in this partition
                 */

                if (!LGRP_EXISTS(lgrp_cur) ||
                    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
                    lpl_leaf->lpl_lgrpid) ||
                    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
                    cpupart->cp_lgrpset))
                        continue;

                lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
                if (lgrp_cur->lgrp_parent != NULL) {
                        /* if lgrp has a parent, assign it properly */
                        parent_id = lgrp_cur->lgrp_parent->lgrp_id;
                        lpl_parent = &cpupart->cp_lgrploads[parent_id];
                } else {
                        /* if not, make sure parent ptr gets set to null */
                        lpl_parent = NULL;
                }

                if (lpl_cur == lpl_leaf) {
                        /*
                         * Almost all leaf state was initialized elsewhere.  The
                         * only thing left to do is to set the parent.
                         */
                        lpl_cur->lpl_parent = lpl_parent;
                        continue;
                }

                lpl_clear(lpl_cur);
                lpl_init(lpl_cur, lpl_leaf, lgrp_cur);

                lpl_cur->lpl_parent = lpl_parent;

                /* does new lpl need to be populated with other resources? */
                rset_intersect =
                    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
                    cpupart->cp_lgrpset);
                klgrpset_nlgrps(rset_intersect, rset_num_intersect);

                if (rset_num_intersect > 1) {
                        /*
                         * If so, figure out what lpls have resources that
                         * intersect this one, and add them.
                         */
                        for (j = 0; j <= lgrp_alloc_max; j++) {
                                lgrp_t  *lgrp_cand;     /* candidate lgrp */
                                lpl_t   *lpl_cand;      /* candidate lpl */

                                lgrp_cand = lgrp_table[j];
                                if (!LGRP_EXISTS(lgrp_cand) ||
                                    !klgrpset_ismember(rset_intersect,
                                    lgrp_cand->lgrp_id))
                                        continue;
                                lpl_cand =
                                    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
                                lpl_rset_add(lpl_cur, lpl_cand);
                        }
                }
                /*
                 * This lpl's rset has changed. Update the hint in its
                 * children.
                 */
                lpl_child_update(lpl_cur, cpupart);
        }
}

/*
 * Remove an lpl from the hierarchy of resources, clearing its state when
 * finished.  If the lpls at the intermediate levels of the hierarchy have no
 * remaining resources, or no longer name a leaf resource in the cpu-partition,
 * delete them as well.
 */

void
lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
{
        int             i;
        lgrp_t          *lgrp_cur;
        lpl_t           *lpl_cur;
        klgrpset_t      leaf_intersect; /* intersection of leaves */

        for (i = 0; i <= lgrp_alloc_max; i++) {
                lgrp_cur = lgrp_table[i];

                /*
                 * Don't attempt to remove from lgrps that aren't there, that
                 * don't contain our leaf, or from the leaf itself. (We do that
                 * later)
                 */

                if (!LGRP_EXISTS(lgrp_cur))
                        continue;

                lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];

                if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
                    lpl_leaf->lpl_lgrpid) ||
                    (lpl_cur == lpl_leaf)) {
                        continue;
                }

                /*
                 * This is a slightly sleazy simplification in that we have
                 * already marked the cp_lgrpset as no longer containing the
                 * leaf we've deleted.  Any lpls that pass the above checks
                 * based upon lgrp membership but not necessarily cpu-part
                 * membership also get cleared by the checks below.  Currently
                 * this is harmless, as the lpls should be empty anyway.
                 *
                 * In particular, we want to preserve lpls that have additional
                 * leaf resources, even though we don't yet have a processor
                 * architecture that represents resources this way.
                 */

                leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
                    cpupart->cp_lgrpset);

                lpl_rset_del(lpl_cur, lpl_leaf);
                if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
                        lpl_clear(lpl_cur);
                } else {
                        /*
                         * Update this lpl's children
                         */
                        lpl_child_update(lpl_cur, cpupart);
                }
        }
        lpl_clear(lpl_leaf);
}

/*
 * Add a cpu to a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, ie. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two general cases for cpu addition:
 *
 * 1. A lpl structure that contains resources already in the hierarchy tree.
 * In this case, all of the associated lpl relationships have been defined, and
 * all that is necessary is that we link the new cpu into the per-lpl list of
 * cpus, and increment the ncpu count of all places where this cpu resource will
 * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
 * pushing is accomplished by this routine.
 *
 * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
 * not exist yet.  In this case, it is necessary to build the leaf lpl, and
 * construct the hierarchy of state necessary to name its more distant
 * resources, if they should exist.  The leaf structure is initialized by this
 * routine, as is the cpu-partition state for the lgrp membership.  This routine
 * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
 * and builds all of the "ancestral" state necessary to identify resources at
 * differing levels of locality.
 */
void
lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
{
        cpupart_t       *cpupart;
        lgrp_t          *lgrp_leaf;
        lpl_t           *lpl_leaf;

        /* called sometimes w/ cpus paused - grab no locks */
        ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

        cpupart = cp->cpu_part;
        lgrp_leaf = lgrp_table[lgrpid];

        /* don't add non-existent lgrp */
        ASSERT(LGRP_EXISTS(lgrp_leaf));
        lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
        cp->cpu_lpl = lpl_leaf;

        /* only leaf lpls contain cpus */

        if (lpl_leaf->lpl_ncpu++ == 0) {
                lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
                klgrpset_add(cpupart->cp_lgrpset, lgrpid);
                lpl_leaf_insert(lpl_leaf, cpupart);
        } else {
                /*
                 * the lpl should already exist in the parent, so just update
                 * the count of available CPUs
                 */
                lpl_cpu_adjcnt(LPL_INCREMENT, cp);
        }

        /* link cpu into list of cpus in lpl */

        if (lpl_leaf->lpl_cpus) {
                cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
                cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
                lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
                lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
        } else {
                /*
                 * We increment ncpu immediately after we create a new leaf
                 * lpl, so assert that ncpu == 1 for the case where we don't
                 * have any cpu pointers yet.
                 */
                ASSERT(lpl_leaf->lpl_ncpu == 1);
                lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
        }

}


/*
 * Remove a cpu from a partition in terms of lgrp load avg bookkeeping
 *
 * The lpl (cpu partition load average information) is now arranged in a
 * hierarchical fashion whereby resources that are closest, ie. most local, to
 * the cpu in question are considered to be leaves in a tree of resources.
 * There are two removal cases in question:
 *
 * 1. Removal of the resource in the leaf leaves other resources remaining in
 * that leaf.  (Another cpu still exists at this level of locality).  In this
 * case, the count of available cpus is decremented in all associated lpls by
 * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
 * from the lpl's list of cpus.
 *
 * 2. Removal of the resource results in the lpl containing no resources.  (It's
 * empty)  In this case, all of what has occurred for the first step must take
 * place; however, additionally we must remove the lpl structure itself, prune
 * out any stranded lpls that do not directly name a leaf resource, and mark the
 * cpu partition in question as no longer containing resources from the lgrp of
 * the lpl that has been deleted.  Cpu-partition changes are handled by this
 * method, but the lpl_leaf_remove function deals with the details of pruning
 * out the empty lpl and any of its orphaned direct ancestors.
 */
void
lgrp_part_del_cpu(cpu_t *cp)
{
        lpl_t           *lpl;
        lpl_t           *leaf_lpl;
        lgrp_t          *lgrp_leaf;

        /* called sometimes w/ cpus paused - grab no locks */

        ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);

        lpl = leaf_lpl = cp->cpu_lpl;
        lgrp_leaf = leaf_lpl->lpl_lgrp;

        /* don't delete a leaf that isn't there */
        ASSERT(LGRP_EXISTS(lgrp_leaf));

        /* no double-deletes */
        ASSERT(lpl->lpl_ncpu);
        if (--lpl->lpl_ncpu == 0) {
                /*
                 * This was the last cpu in this lgroup for this partition,
                 * clear its bit in the partition's lgroup bitmask
                 */
                klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);

                /* eliminate remaining lpl link pointers in cpu, lpl */
                lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;

                lpl_leaf_remove(leaf_lpl, cp->cpu_part);
        } else {

                /* unlink cpu from lists of cpus in lpl */
                cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
                cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
                if (lpl->lpl_cpus == cp) {
                        lpl->lpl_cpus = cp->cpu_next_lpl;
                }

                /*
                 * Update the cpu count in the lpls associated with parent
                 * lgroups.
                 */
                lpl_cpu_adjcnt(LPL_DECREMENT, cp);

        }
        /* clear cpu's lpl ptr when we're all done */
        cp->cpu_lpl = NULL;
}

/*
 * Recompute load average for the specified partition/lgrp fragment.
 *
 * We rely on the fact that this routine is called from the clock thread
 * at a point before the clock thread can block (i.e. before its first
 * lock request).  Since the clock thread cannot be preempted (because it
 * runs at highest priority), we know that cpu partitions cannot change
 * (since doing so would require either the repartition requester or the
 * cpu_pause thread to run on this cpu), so we can update the cpu's load
 * without grabbing cpu_lock.
 */
void
lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
{
        uint_t          ncpu;
        int64_t         old, new, f;

        /*
         * Table of decay factors: expval[ncpu] is (1 - exp(-1/(20 * ncpu)))
         * in 16-bit fixed point, e.g. expval[1] == 3196 ~= 0.0488 * 65536.
         */
        static short expval[] = {
            0, 3196, 1618, 1083,
            814, 652, 543, 466,
            408, 363, 326, 297,
            272, 251, 233, 218,
            204, 192, 181, 172,
            163, 155, 148, 142,
            136, 130, 125, 121,
            116, 112, 109, 105
        };

        /* ASSERT (called from clock level) */

        if ((lpl == NULL) ||    /* we're booting - this is easiest for now */
            ((ncpu = lpl->lpl_ncpu) == 0)) {
                return;
        }

        for (;;) {

                if (ncpu >= sizeof (expval) / sizeof (expval[0]))
                        f = expval[1]/ncpu; /* good approx. for large ncpu */
                else
                        f = expval[ncpu];

                /*
                 * Modify the load average atomically to avoid losing
                 * anticipatory load updates (see lgrp_move_thread()).
                 */
                if (ageflag) {
                        /*
                         * We're supposed to both update and age the load.
                         * This happens 10 times/sec. per cpu.  We do a
                         * little hoop-jumping to avoid integer overflow.
                         */
                        int64_t         q, r;

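                        /*
                         * old is split into its high and low 16-bit halves
                         * (q, r), each pre-scaled by 2^7, so the update below
                         * can be applied in pieces; ignoring rounding, its
                         * net effect is
                         * new = old * (1 - f/2^16) + (nrcpus * f) >> 7.
                         */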
                        do {
                                old = new = lpl->lpl_loadavg;
                                q = (old  >> 16) << 7;
                                r = (old  & 0xffff) << 7;
                                new += ((long long)(nrcpus - q) * f -
                                    ((r * f) >> 16)) >> 7;

                                /*
                                 * Check for overflow
                                 */
                                if (new > LGRP_LOADAVG_MAX)
                                        new = LGRP_LOADAVG_MAX;
                                else if (new < 0)
                                        new = 0;
                        } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
                            old, new) != old);
                } else {
                        /*
                         * We're supposed to update the load, but not age it.
                         * This option is used to update the load (which either
                         * has already been aged in this 1/10 sec. interval or
                         * soon will be) to account for a remotely executing
                         * thread.
                         */
                        do {
                                old = new = lpl->lpl_loadavg;
                                new += f;
                                /*
                                 * Check for overflow
                                 * Underflow not possible here
                                 */
                                if (new < old)
                                        new = LGRP_LOADAVG_MAX;
                        } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
                            old, new) != old);
                }

                /*
                 * Do the same for this lpl's parent
                 */
                if ((lpl = lpl->lpl_parent) == NULL)
                        break;
                ncpu = lpl->lpl_ncpu;
        }
}

/*
 * Initialize lpl topology in the target based on topology currently present in
 * lpl_bootstrap.
 *
 * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
 * initialize cp_default list of lpls. Up to this point all topology operations
 * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
 * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
 * `target' points to the list of lpls in cp_default and `size' is the size of
 * this list.
 *
 * This function walks the lpl topology in lpl_bootstrap and does four things:
 *
 * 1) Copies all fields from lpl_bootstrap to the target.
 *
 * 2) Sets CPU0 lpl pointer to the correct element of the target list.
 *
 * 3) Updates lpl_parent pointers to point to the lpls in the target list
 *    instead of lpl_bootstrap.
 *
 * 4) Updates pointers in the resource list of the target to point to the lpls
 *    in the target list instead of lpl_bootstrap.
 *
 * After lpl_topo_bootstrap() completes, target contains the same information
 * that would be present there if it were used during boot instead of
 * lpl_bootstrap. The information in lpl_bootstrap is no longer needed after
 * this point, so it is bzeroed.
 */
void
lpl_topo_bootstrap(lpl_t *target, int size)
{
        lpl_t   *lpl = lpl_bootstrap;
        lpl_t   *target_lpl = target;
        lpl_t   **rset;
        int     *id2rset;
        int     sz;
        int     howmany;
        int     id;
        int     i;

        /*
         * The only target that should be passed here is cp_default lpl list.
         */
        ASSERT(target == cp_default.cp_lgrploads);
        ASSERT(size == cp_default.cp_nlgrploads);
        ASSERT(!lgrp_topo_initialized);
        ASSERT(ncpus == 1);

        howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
        for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
                /*
                 * Copy all fields from lpl, except for the rset,
                 * lgrp id <=> rset mapping storage,
                 * and amount of storage
                 */
                rset = target_lpl->lpl_rset;
                id2rset = target_lpl->lpl_id2rset;
                sz = target_lpl->lpl_rset_sz;

                *target_lpl = *lpl;

                target_lpl->lpl_rset_sz = sz;
                target_lpl->lpl_rset = rset;
                target_lpl->lpl_id2rset = id2rset;

                /*
                 * Substitute CPU0 lpl pointer with one relative to target.
                 */
                if (lpl->lpl_cpus == CPU) {
                        ASSERT(CPU->cpu_lpl == lpl);
                        CPU->cpu_lpl = target_lpl;
                }

                /*
                 * Substitute parent information with parent relative to target.
                 */
                if (lpl->lpl_parent != NULL)
                        target_lpl->lpl_parent = (lpl_t *)
                            (((uintptr_t)lpl->lpl_parent -
                            (uintptr_t)lpl_bootstrap) +
                            (uintptr_t)target);

                /*
                 * Walk over resource set substituting pointers relative to
                 * lpl_bootstrap's rset to pointers relative to target's
                 */
                ASSERT(lpl->lpl_nrset <= 1);

                for (id = 0; id < lpl->lpl_nrset; id++) {
                        if (lpl->lpl_rset[id] != NULL) {
                                target_lpl->lpl_rset[id] = (lpl_t *)
                                    (((uintptr_t)lpl->lpl_rset[id] -
                                    (uintptr_t)lpl_bootstrap) +
                                    (uintptr_t)target);
                        }
                        target_lpl->lpl_id2rset[id] =
                            lpl->lpl_id2rset[id];
                }
        }

        /*
         * Clean up the bootstrap lpls since we have switched over to the
         * actual lpl array in the default cpu partition.
         *
         * We still need to keep one empty lpl around for newly starting
         * slave CPUs to reference should they need to make it through the
         * dispatcher prior to their lgrp/lpl initialization.
         *
         * The lpl related dispatcher code has been designed to work properly
         * (and without extra checks) for this special case of a zero'ed
         * bootstrap lpl. Such an lpl appears to the dispatcher as an lpl
         * with lgrpid 0 and an empty resource set.  This is also why the
         * dispatcher's iteration over the rset array is NULL terminated.
         *
         * This provides the desired behaviour for an uninitialized CPU.
         * It shouldn't see any other CPU to either dispatch to or steal
         * from until it is properly initialized.
         */
        bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
        bzero(lpl_bootstrap_id2rset, sizeof (lpl_bootstrap_id2rset));
        bzero(lpl_bootstrap_rset, sizeof (lpl_bootstrap_rset));

        lpl_bootstrap_list[0].lpl_rset = lpl_bootstrap_rset;
        lpl_bootstrap_list[0].lpl_id2rset = lpl_bootstrap_id2rset;
}

/*
 * If the lowest load among the lgroups a process' threads are currently
 * spread across is greater than lgrp_expand_proc_thresh, we'll consider
 * expanding the process to a new lgroup.
 */
#define LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
lgrp_load_t     lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;

#define LGRP_EXPAND_PROC_THRESH(ncpu) \
        ((lgrp_expand_proc_thresh) / (ncpu))

/*
 * A process will be expanded to a new lgroup only if the difference between
 * the lowest load on the lgroups the process' threads are currently spread
 * across and the lowest load on the other lgroups in the process' partition
 * is greater than lgrp_expand_proc_diff.
 */
#define LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
lgrp_load_t     lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;

#define LGRP_EXPAND_PROC_DIFF(ncpu) \
        ((lgrp_expand_proc_diff) / (ncpu))

/*
 * The loadavg tolerance accounts for "noise" inherent in the load, which may
 * be present due to impreciseness of the load average decay algorithm.
 *
 * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
 * tolerance is scaled by the number of cpus in the lgroup just like
 * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
 * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
 * of: 0x10000 / 4 => 0x4000 or greater to be significant.
 */
uint32_t        lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
#define LGRP_LOADAVG_TOLERANCE(ncpu)    \
        ((lgrp_loadavg_tolerance) / ncpu)

/*
 * lgrp_choose() will choose root lgroup as home when lowest lgroup load
 * average is above this threshold
 */
uint32_t        lgrp_load_thresh = UINT32_MAX;

/*
 * lgrp_choose() will try to skip any lgroups with less memory
 * than this free when choosing a home lgroup
 */
pgcnt_t lgrp_mem_free_thresh = 0;

/*
 * When choosing between similarly loaded lgroups, lgrp_choose() will pick
 * one based on one of the following policies:
 * - Random selection
 * - Pseudo round robin placement
 * - Longest time since a thread was last placed
 */
#define LGRP_CHOOSE_RANDOM      1
#define LGRP_CHOOSE_RR          2
#define LGRP_CHOOSE_TIME        3

int     lgrp_choose_policy = LGRP_CHOOSE_TIME;

/*
 * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
 * be bound to a CPU or processor set.
 *
 * Arguments:
 *      t               The thread
 *      cpupart         The partition the thread belongs to.
 *
 * NOTE: Should at least be called with the cpu_lock held, kernel preemption
 *       disabled, or thread_lock held (at splhigh) to protect against the CPU
 *       partitions changing out from under us, and assumes that the given
 *       thread is protected.  Also, this is sometimes called with cpus paused
 *       or kernel preemption disabled, so don't grab any locks because we
 *       should never block under those conditions.
 */
lpl_t *
lgrp_choose(kthread_t *t, cpupart_t *cpupart)
{
        lgrp_load_t     bestload, bestrload;
        int             lgrpid_offset, lgrp_count;
        lgrp_id_t       lgrpid, lgrpid_start;
        lpl_t           *lpl, *bestlpl, *bestrlpl;
        klgrpset_t      lgrpset;
        proc_t          *p;

        ASSERT(t != NULL);
        ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
            THREAD_LOCK_HELD(t));
        ASSERT(cpupart != NULL);

        p = t->t_procp;

        /* A process should always be in an active partition */
        ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));

        bestlpl = bestrlpl = NULL;
        bestload = bestrload = LGRP_LOADAVG_MAX;
        lgrpset = cpupart->cp_lgrpset;

        switch (lgrp_choose_policy) {
        case LGRP_CHOOSE_RR:
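                /*
                 * Round robin: start the search at the next lgroup in the
                 * partition after the last placement hint.
                 */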
                lgrpid = cpupart->cp_lgrp_hint;
                do {
                        if (++lgrpid > lgrp_alloc_max)
                                lgrpid = 0;
                } while (!klgrpset_ismember(lgrpset, lgrpid));

                break;
        default:
        case LGRP_CHOOSE_TIME:
        case LGRP_CHOOSE_RANDOM:
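                /*
                 * Pick a pseudo-random member of the partition, based on the
                 * current time, as the starting point for the search.
                 */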
                klgrpset_nlgrps(lgrpset, lgrp_count);
                lgrpid_offset =
                    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
                for (lgrpid = 0; ; lgrpid++) {
                        if (klgrpset_ismember(lgrpset, lgrpid)) {
                                if (--lgrpid_offset == 0)
                                        break;
                        }
                }
                break;
        }

        lgrpid_start = lgrpid;

        DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
            lgrp_id_t, cpupart->cp_lgrp_hint);

        /*
         * Use lgroup affinities (if any) to choose best lgroup
         *
         * NOTE: Assumes that thread is protected from going away and its
         *       lgroup affinities won't change (ie. p_lock, or
         *       thread_lock() being held and/or CPUs paused)
         */
        if (t->t_lgrp_affinity) {
                lpl = lgrp_affinity_best(t, cpupart, lgrpid_start, B_FALSE);
                if (lpl != NULL)
                        return (lpl);
        }

        ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));

        do {
                pgcnt_t npgs;

                /*
                 * Skip any lgroups outside of thread's pset
                 */
                if (!klgrpset_ismember(lgrpset, lgrpid)) {
                        if (++lgrpid > lgrp_alloc_max)
                                lgrpid = 0;     /* wrap the search */
                        continue;
                }

                /*
                 * Skip any non-leaf lgroups
                 */
                if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
                        continue;

                /*
                 * Skip any lgroups without enough free memory
                 * (when threshold set to nonzero positive value)
                 */
                if (lgrp_mem_free_thresh > 0) {
                        npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
                        if (npgs < lgrp_mem_free_thresh) {
                                if (++lgrpid > lgrp_alloc_max)
                                        lgrpid = 0;     /* wrap the search */
                                continue;
                        }
                }

                lpl = &cpupart->cp_lgrploads[lgrpid];
                if (klgrpset_isempty(p->p_lgrpset) ||
                    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
                        /*
                         * Either this is a new process or the process already
                         * has threads on this lgrp, so this is a preferred
                         * lgroup for the thread.
                         */
                        if (bestlpl == NULL ||
                            lpl_pick(lpl, bestlpl)) {
                                bestload = lpl->lpl_loadavg;
                                bestlpl = lpl;
                        }
                } else {
                        /*
                         * The process doesn't have any threads on this lgrp,
                         * but we're willing to consider this lgrp if the load
                         * difference is big enough to justify splitting up
                         * the process' threads.
                         */
                        if (bestrlpl == NULL ||
                            lpl_pick(lpl, bestrlpl)) {
                                bestrload = lpl->lpl_loadavg;
                                bestrlpl = lpl;
                        }
                }
                if (++lgrpid > lgrp_alloc_max)
                        lgrpid = 0;     /* wrap the search */
        } while (lgrpid != lgrpid_start);

        /*
         * Return root lgroup if threshold isn't set to maximum value and
         * the lowest lgroup load average is more than that threshold
         */
        if (lgrp_load_thresh != UINT32_MAX &&
            bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
                return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);

        /*
         * If all the lgroups over which the thread's process is spread are
         * heavily loaded, or otherwise undesirable, we'll consider placing
         * the thread on one of the other leaf lgroups in the thread's
         * partition.
         */
        if ((bestlpl == NULL) ||
            ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
            (bestrload < bestload) &&   /* paranoid about wraparound */
            (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
            bestload))) {
                bestlpl = bestrlpl;
        }

        if (bestlpl == NULL) {
                /*
                 * No lgroup looked particularly good, but we still
                 * have to pick something. Go with the randomly selected
                 * legal lgroup we started with above.
                 */
                bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
        }

        cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
        bestlpl->lpl_homed_time = gethrtime_unscaled();

        ASSERT(bestlpl->lpl_ncpu > 0);
        return (bestlpl);
}

/*
 * Decide if lpl1 is a better candidate than lpl2 for lgrp homing.
 * Returns non-zero if lpl1 is a better candidate, and 0 otherwise.
 */
static int
lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
{
        lgrp_load_t     l1, l2;
        lgrp_load_t     tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);

        l1 = lpl1->lpl_loadavg;
        l2 = lpl2->lpl_loadavg;

        if ((l1 + tolerance < l2) && (l1 < l2)) {
                /* lpl1 is significantly less loaded than lpl2 */
                return (1);
        }

        if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
            l1 + tolerance >= l2 && l1 < l2 &&
            lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
                /*
                         * lpl1's load is within the tolerance of lpl2's.  We're
                         * willing to consider it to be better, however, if it
                         * has been longer since we last homed a thread there.
                 */
                return (1);
        }

        return (0);
}
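
/*
 * A small worked example of the lpl_pick() comparison above, using purely
 * illustrative numbers (the real tolerance comes from
 * LGRP_LOADAVG_TOLERANCE() and scales with lpl_ncpu):
 *
 *	tolerance = 0x10000
 *	l1 = 0x04000, l2 = 0x20000:  l1 + tolerance < l2, so lpl1 wins outright
 *	l1 = 0x18000, l2 = 0x20000:  within tolerance; under LGRP_CHOOSE_TIME
 *	                             lpl1 wins only if its lpl_homed_time is
 *	                             older than lpl2's
 *	l1 = 0x20000, l2 = 0x18000:  l1 >= l2, so lpl1 never wins
 */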

/*
 * lgrp_trthr_moves counts the number of times the main thread (t_tid = 1) of
 * a process that uses text replication changed its home lgrp.  This info is
 * used by the segvn asynchronous thread to detect whether it needs to recheck
 * which lgrps should be used for text replication.
 */
static uint64_t lgrp_trthr_moves = 0;

uint64_t
lgrp_get_trthr_migrations(void)
{
        return (lgrp_trthr_moves);
}

void
lgrp_update_trthr_migrations(uint64_t incr)
{
        atomic_add_64(&lgrp_trthr_moves, incr);
}

/*
 * An LWP is expected to be assigned to an lgroup for at least this long
 * for its anticipatory load to be justified.  NOTE that this value should
 * not be set extremely large (say, larger than 100 years), to avoid problems
 * with overflow in the calculation that uses it.
 */
#define LGRP_MIN_NSEC   (NANOSEC / 10)          /* 1/10 of a second */
hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;

/*
 * Routine to change a thread's lgroup affiliation.  This routine updates
 * the thread's kthread_t struct and its process' proc_t struct to note the
 * thread's new lgroup affiliation, and its lgroup affinities.
 *
 * Note that this is the only routine that modifies a thread's t_lpl field,
 * and that adds in or removes anticipatory load.
 *
 * If the thread is exiting, newlpl is NULL.
 *
 * Locking:
 * One of the following must be held on entry:
 *      cpu_lock, kpreempt_disable(), or thread_lock -- to ensure t's new lgrp
 *              doesn't get removed from t's partition
 *
 * This routine is not allowed to grab any locks, since it may be called
 * with cpus paused (such as from cpu_offline).
 */
void
lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
{
        proc_t          *p;
        lpl_t           *lpl, *oldlpl;
        lgrp_id_t       oldid;
        kthread_t       *tp;
        uint_t          ncpu;
        lgrp_load_t     old, new;

        ASSERT(t);
        ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
            THREAD_LOCK_HELD(t));

        /*
         * If not changing lpls, just return
         */
        if ((oldlpl = t->t_lpl) == newlpl)
                return;

        /*
         * Make sure the thread's lwp hasn't exited (if so, this thread is now
         * associated with process 0 rather than with its original process).
         */
        if (t->t_proc_flag & TP_LWPEXIT) {
                if (newlpl != NULL) {
                        t->t_lpl = newlpl;
                }
                return;
        }

        p = ttoproc(t);

        /*
         * If the thread had a previous lgroup, update its process' p_lgrpset
         * to account for it being moved from its old lgroup.
         */
        if ((oldlpl != NULL) && /* thread had a previous lgroup */
            (p->p_tlist != NULL)) {
                oldid = oldlpl->lpl_lgrpid;

                if (newlpl != NULL)
                        lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);

                if ((do_lgrpset_delete) &&
                    (klgrpset_ismember(p->p_lgrpset, oldid))) {
                        for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
                                /*
                                 * Check if a thread other than the thread
                                 * that's moving is assigned to the same
                                 * lgroup as the thread that's moving.  Note
                                 * that we have to compare lgroup IDs, rather
                                 * than simply comparing t_lpl's, since the
                                 * threads may belong to different partitions
                                 * but be assigned to the same lgroup.
                                 */
                                ASSERT(tp->t_lpl != NULL);

                                if ((tp != t) &&
                                    (tp->t_lpl->lpl_lgrpid == oldid)) {
                                        /*
                                         * Another thread is assigned to the
                                         * same lgroup as the thread that's
                                         * moving, so p_lgrpset doesn't change.
                                         */
                                        break;
                                } else if (tp == p->p_tlist) {
                                        /*
                                         * No other thread is assigned to the
                                         * same lgroup as the exiting thread,
                                         * clear the lgroup's bit in p_lgrpset.
                                         */
                                        klgrpset_del(p->p_lgrpset, oldid);
                                        break;
                                }
                        }
                }

                /*
                 * If this thread was assigned to its old lgroup for such a
                 * short amount of time that the anticipatory load that was
                 * added on its behalf has aged very little, remove that
                 * anticipatory load.
                 */
                if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
                    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
                        lpl = oldlpl;
                        for (;;) {
                                do {
                                        old = new = lpl->lpl_loadavg;
                                        new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
                                        if (new > old) {
                                                /*
                                                 * this can happen if the load
                                                 * average was aged since we
                                                 * added in the anticipatory
                                                 * load
                                                 */
                                                new = 0;
                                        }
                                } while (atomic_cas_32(
                                    (lgrp_load_t *)&lpl->lpl_loadavg, old,
                                    new) != old);

                                lpl = lpl->lpl_parent;
                                if (lpl == NULL)
                                        break;

                                ncpu = lpl->lpl_ncpu;
                                ASSERT(ncpu > 0);
                        }
                }
        }
        /*
         * If the thread has a new lgroup (i.e. it's not exiting), update its
         * t_lpl and its process' p_lgrpset, and apply an anticipatory load
         * to its new lgroup to account for its move to its new lgroup.
         */
        if (newlpl != NULL) {
                /*
                 * This thread is moving to a new lgroup
                 */
                t->t_lpl = newlpl;
                if (t->t_tid == 1 && p->p_t1_lgrpid != newlpl->lpl_lgrpid) {
                        p->p_t1_lgrpid = newlpl->lpl_lgrpid;
                        membar_producer();
                        if (p->p_tr_lgrpid != LGRP_NONE &&
                            p->p_tr_lgrpid != p->p_t1_lgrpid) {
                                lgrp_update_trthr_migrations(1);
                        }
                }

                /*
                 * Reflect move in load average of new lgroup
                 * unless it is root lgroup
                 */
                if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
                        return;

                if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
                        klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
                }

                /*
                 * It'll take some time for the load on the new lgroup
                 * to reflect this thread's placement on it.  We don't,
                 * however, want all threads between now and then to
                 * pile onto this lgroup as well.  To avoid this pileup,
                 * we anticipate the load this thread will generate on
                 * its new lgroup.  The goal is to make the lgroup's
                 * load appear as though the thread had been there all
                 * along.  We're very conservative in calculating this
                 * anticipatory load; we assume the worst case (a 100%
                 * CPU-bound thread).  This may be made more accurate
                 * in the future.
                 */
                lpl = newlpl;
                for (;;) {
                        ncpu = lpl->lpl_ncpu;
                        ASSERT(ncpu > 0);
                        do {
                                old = new = lpl->lpl_loadavg;
                                new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
                                /*
                                 * Check for overflow
                                 * Underflow not possible here
                                 */
                                if (new < old)
                                        new = UINT32_MAX;
                        } while (atomic_cas_32((lgrp_load_t *)&lpl->lpl_loadavg,
                            old, new) != old);

                        lpl = lpl->lpl_parent;
                        if (lpl == NULL)
                                break;
                }
                t->t_anttime = gethrtime();
        }
}
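
/*
 * Hedged usage sketch for lgrp_move_thread() above (illustrative only, not
 * an actual call site; "newlpl" stands for an lpl chosen by the lgroup
 * selection logic earlier in this file):
 *
 *	thread_lock(t);
 *	lgrp_move_thread(t, newlpl, 1);
 *	thread_unlock(t);
 *
 * Holding the thread lock satisfies the locking requirement documented
 * above, since it keeps the thread's partition (and thus newlpl) stable
 * across the call.
 */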

/*
 * Return lgroup memory allocation policy given advice from madvise(3C)
 */
lgrp_mem_policy_t
lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
{
        switch (advice) {
        case MADV_ACCESS_LWP:
                return (LGRP_MEM_POLICY_NEXT);
        case MADV_ACCESS_MANY:
                return (LGRP_MEM_POLICY_RANDOM);
        default:
                return (lgrp_mem_policy_default(size, type));
        }
}
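
/*
 * Hedged example of the mapping above (illustrative only): for a private
 * mapping that has received MADV_ACCESS_LWP advice,
 *
 *	policy = lgrp_madv_to_policy(MADV_ACCESS_LWP, len, MAP_PRIVATE);
 *
 * yields LGRP_MEM_POLICY_NEXT (allocate near the next LWP to touch the
 * memory), while MADV_ACCESS_MANY yields LGRP_MEM_POLICY_RANDOM.  Any other
 * advice falls back to lgrp_mem_policy_default().
 */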

/*
 * Figure out default policy
 */
lgrp_mem_policy_t
lgrp_mem_policy_default(size_t size, int type)
{
        cpupart_t               *cp;
        lgrp_mem_policy_t       policy;
        size_t                  pset_mem_size;

        /*
         * Randomly allocate memory across lgroups for private or shared
         * memory beyond the respective size threshold
         */
        if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
            (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
                /*
                 * Get total memory size of current thread's pset
                 */
                kpreempt_disable();
                cp = curthread->t_cpupart;
                klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
                kpreempt_enable();

                /*
                 * Choose policy to randomly allocate memory across
                 * lgroups in pset if it will fit and is not default
                 * partition.  Otherwise, allocate memory randomly
                 * across machine.
                 */
                if (lgrp_mem_pset_aware && size < pset_mem_size)
                        policy = LGRP_MEM_POLICY_RANDOM_PSET;
                else
                        policy = LGRP_MEM_POLICY_RANDOM;
        } else
                /*
                 * Apply default policy for private memory and
                 * shared memory under the respective random
                 * threshold.
                 */
                policy = lgrp_mem_default_policy;

        return (policy);
}
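
/*
 * Illustrative example of the default-policy decision above (the thresholds
 * are tunables; the numbers here are made up): with lgrp_shm_random_thresh
 * at 8M, a 16M MAP_SHARED request is randomized across the thread's pset
 * (LGRP_MEM_POLICY_RANDOM_PSET) if lgrp_mem_pset_aware is set and the pset's
 * lgroups hold more than 16M of memory, and randomized across the whole
 * machine (LGRP_MEM_POLICY_RANDOM) otherwise.  A 4K request below both
 * random thresholds simply gets lgrp_mem_default_policy.
 */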

/*
 * Get memory allocation policy for this segment
 */
lgrp_mem_policy_info_t *
lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
{
        lgrp_mem_policy_info_t  *policy_info;
        extern struct seg_ops   segspt_ops;
        extern struct seg_ops   segspt_shmops;

        /*
         * This is for binary compatibility to protect against third party
         * segment drivers which haven't been recompiled to allow for
         * SEGOP_GETPOLICY()
         */
        if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
            seg->s_ops != &segspt_shmops)
                return (NULL);

        policy_info = NULL;
        if (seg->s_ops->getpolicy != NULL)
                policy_info = SEGOP_GETPOLICY(seg, vaddr);

        return (policy_info);
}

/*
 * Set the policy for allocating private memory, given the desired policy,
 * the policy info to update, and the size in bytes of the memory to which
 * the policy applies.
 * Return 0 if the policy wasn't already set and 1 if it was.
 */
int
lgrp_privm_policy_set(lgrp_mem_policy_t policy,
    lgrp_mem_policy_info_t *policy_info, size_t size)
{

        ASSERT(policy_info != NULL);

        if (policy == LGRP_MEM_POLICY_DEFAULT)
                policy = lgrp_mem_policy_default(size, MAP_PRIVATE);

        /*
         * Policy set already?
         */
        if (policy == policy_info->mem_policy)
                return (1);

        /*
         * Set policy
         */
        policy_info->mem_policy = policy;
        policy_info->mem_lgrpid = LGRP_NONE;

        return (0);
}


/*
 * Get the shared memory allocation policy for the given anon_map or vnode
 * and offset
 */
lgrp_mem_policy_info_t *
lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
    u_offset_t vn_off)
{
        u_offset_t              off;
        lgrp_mem_policy_info_t  *policy_info;
        lgrp_shm_policy_seg_t   *policy_seg;
        lgrp_shm_locality_t     *shm_locality;
        avl_tree_t              *tree;
        avl_index_t             where;

        shm_locality = NULL;
        tree = NULL;
        /*
         * Get policy segment tree from anon_map or vnode and use specified
         * anon index or vnode offset as offset
         *
         * Assume that no lock needs to be held on anon_map or vnode, since
         * they should be protected by their reference count which must be
         * nonzero for an existing segment
         */
        if (amp) {
                ASSERT(amp->refcnt != 0);
                shm_locality = amp->locality;
                if (shm_locality == NULL)
                        return (NULL);
                tree = shm_locality->loc_tree;
                off = ptob(anon_index);
        } else if (vp) {
                shm_locality = vp->v_locality;
                if (shm_locality == NULL)
                        return (NULL);
                ASSERT(shm_locality->loc_count != 0);
                tree = shm_locality->loc_tree;
                off = vn_off;
        }

        if (tree == NULL)
                return (NULL);

        /*
         * Lookup policy segment for offset into shared object and return
         * policy info
         */
        rw_enter(&shm_locality->loc_lock, RW_READER);
        policy_info = NULL;
        policy_seg = avl_find(tree, &off, &where);
        if (policy_seg)
                policy_info = &policy_seg->shm_policy;
        rw_exit(&shm_locality->loc_lock);

        return (policy_info);
}

/*
 * Default memory allocation policy for kernel segmap pages
 */
lgrp_mem_policy_t       lgrp_segmap_default_policy = LGRP_MEM_POLICY_RANDOM;

/*
 * Return lgroup to use for allocating memory
 * given the segment and address
 *
 * There is no mutual exclusion between calls to this routine and
 * DR, so this routine and whoever calls it should be mindful of
 * the possibility that the lgrp returned may be deleted.  If this
 * happens, dereferences of the lgrp pointer will still be safe,
 * but the resources in the lgrp will be gone, and LGRP_EXISTS()
 * will no longer be true.
 */
lgrp_t *
lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
{
        int                     i;
        lgrp_t                  *lgrp;
        klgrpset_t              lgrpset;
        int                     lgrps_spanned;
        unsigned long           off;
        lgrp_mem_policy_t       policy;
        lgrp_mem_policy_info_t  *policy_info;
        ushort_t                random;
        int                     stat = 0;
        extern struct seg       *segkmap;

        /*
         * Just return the root lgroup if the lgrp framework hasn't
         * finished initializing or if this is a UMA machine.
         */
        if (nlgrps == 1 || !lgrp_initialized)
                return (lgrp_root);

        /*
         * Get memory allocation policy for this segment
         */
        policy = lgrp_mem_default_policy;
        if (seg != NULL) {
                if (seg->s_as == &kas) {
                        if (seg == segkmap)
                                policy = lgrp_segmap_default_policy;
                        if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
                            policy == LGRP_MEM_POLICY_RANDOM_PSET)
                                policy = LGRP_MEM_POLICY_RANDOM;
                } else {
                        policy_info = lgrp_mem_policy_get(seg, vaddr);
                        if (policy_info != NULL) {
                                policy = policy_info->mem_policy;
                                if (policy == LGRP_MEM_POLICY_NEXT_SEG) {
                                        lgrp_id_t id = policy_info->mem_lgrpid;
                                        ASSERT(id != LGRP_NONE);
                                        ASSERT(id < NLGRPS_MAX);
                                        lgrp = lgrp_table[id];
                                        if (!LGRP_EXISTS(lgrp)) {
                                                policy = LGRP_MEM_POLICY_NEXT;
                                        } else {
                                                lgrp_stat_add(id,
                                                    LGRP_NUM_NEXT_SEG, 1);
                                                return (lgrp);
                                        }
                                }
                        }
                }
        }
        lgrpset = 0;

        /*
         * Initialize lgroup to home by default
         */
        lgrp = lgrp_home_lgrp();

        /*
         * When homing threads on root lgrp, override default memory
         * allocation policies with root lgroup memory allocation policy
         */
        if (lgrp == lgrp_root)
                policy = lgrp_mem_policy_root;

        /*
         * Implement policy
         */
        switch (policy) {
        case LGRP_MEM_POLICY_NEXT_CPU:

                /*
                 * Return the lgroup of the current CPU, which faulted on
                 * memory.  If the CPU isn't currently in an lgrp, then opt
                 * to allocate from the root.
                 *
                 * Kernel preemption needs to be disabled here to prevent
                 * the current CPU from going away before lgrp is found.
                 */
                if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
                        lgrp = lgrp_root;
                } else {
                        kpreempt_disable();
                        lgrp = lgrp_cpu_to_lgrp(CPU);
                        kpreempt_enable();
                }
                break;

        case LGRP_MEM_POLICY_NEXT:
        case LGRP_MEM_POLICY_DEFAULT:
        default:

                /*
                 * Just return the current thread's home lgroup
                 * for the default policy (next touch).
                 * If the thread is homed to the root, the default
                 * policy is random across lgroups; fall through
                 * to the random case.
                 */
                if (lgrp != lgrp_root) {
                        if (policy == LGRP_MEM_POLICY_NEXT)
                                lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
                        else
                                lgrp_stat_add(lgrp->lgrp_id,
                                    LGRP_NUM_DEFAULT, 1);
                        break;
                }
                /* FALLTHROUGH */
        case LGRP_MEM_POLICY_RANDOM:

                /*
                 * Return a random leaf lgroup with memory
                 */
                lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
                /*
                 * Count how many lgroups are spanned
                 */
                klgrpset_nlgrps(lgrpset, lgrps_spanned);

                /*
                 * There may be no memnodes in the root lgroup during DR copy
                 * rename on a system with only two boards (memnodes)
                 * configured. In this case just return the root lgrp.
                 */
                if (lgrps_spanned == 0) {
                        lgrp = lgrp_root;
                        break;
                }

                /*
                 * Pick a random offset within lgroups spanned
                 * and return lgroup at that offset
                 */
                random = (ushort_t)gethrtime() >> 4;
                off = random % lgrps_spanned;
                ASSERT(off <= lgrp_alloc_max);

                for (i = 0; i <= lgrp_alloc_max; i++) {
                        if (!klgrpset_ismember(lgrpset, i))
                                continue;
                        if (off)
                                off--;
                        else {
                                lgrp = lgrp_table[i];
                                lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
                                    1);
                                break;
                        }
                }
                break;

        case LGRP_MEM_POLICY_RANDOM_PROC:

                /*
                 * Grab copy of bitmask of lgroups spanned by
                 * this process
                 */
                klgrpset_copy(lgrpset, curproc->p_lgrpset);
                stat = LGRP_NUM_RANDOM_PROC;

                /* FALLTHROUGH */
        case LGRP_MEM_POLICY_RANDOM_PSET:

                if (!stat)
                        stat = LGRP_NUM_RANDOM_PSET;

                if (klgrpset_isempty(lgrpset)) {
                        /*
                         * Grab copy of bitmask of lgroups spanned by
                         * this processor set
                         */
                        kpreempt_disable();
                        klgrpset_copy(lgrpset,
                            curthread->t_cpupart->cp_lgrpset);
                        kpreempt_enable();
                }

                /*
                 * Count how many lgroups are spanned
                 */
                klgrpset_nlgrps(lgrpset, lgrps_spanned);
                ASSERT(lgrps_spanned <= nlgrps);

                /*
                 * lgrps_spanned should probably always be non-zero, but to
                 * be on the safe side we return lgrp_root if it is zero.
                 */
                if (lgrps_spanned == 0) {
                        lgrp = lgrp_root;
                        break;
                }

                /*
                 * Pick a random offset within lgroups spanned
                 * and return lgroup at that offset
                 */
                random = (ushort_t)gethrtime() >> 4;
                off = random % lgrps_spanned;
                ASSERT(off <= lgrp_alloc_max);

                for (i = 0; i <= lgrp_alloc_max; i++) {
                        if (!klgrpset_ismember(lgrpset, i))
                                continue;
                        if (off)
                                off--;
                        else {
                                lgrp = lgrp_table[i];
                                lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
                                    1);
                                break;
                        }
                }
                break;

        case LGRP_MEM_POLICY_ROUNDROBIN:

                /*
                 * Use the offset within the segment to determine the
                 * offset from the home lgroup at which to choose the
                 * next lgroup to allocate memory from
                 */
                off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
                    (lgrp_alloc_max + 1);

                kpreempt_disable();
                lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
                i = lgrp->lgrp_id;
                kpreempt_enable();

                while (off > 0) {
                        i = (i + 1) % (lgrp_alloc_max + 1);
                        lgrp = lgrp_table[i];
                        if (klgrpset_ismember(lgrpset, i))
                                off--;
                }
                lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);

                break;
        }

        ASSERT(lgrp != NULL);
        return (lgrp);
}
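
/*
 * Hedged usage sketch for lgrp_mem_choose() above (illustrative only): a
 * page allocation path typically asks for the lgroup to target and then
 * iterates over its memnodes, e.g.
 *
 *	lgrp_t *lgrp = lgrp_mem_choose(seg, vaddr, pgsz);
 *
 * followed by memnode selection via lgrp_memnode_choose() (see below).  As
 * noted above, the returned lgrp may be deleted by DR at any time, so the
 * caller must tolerate LGRP_EXISTS() becoming false.
 */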

/*
 * Return the number of pages in an lgroup
 *
 * NOTE: The NUMA test (numat) driver uses this, so changing the arguments or
 *       semantics could cause tests that rely on the numat driver to fail.
 */
pgcnt_t
lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
{
        lgrp_t *lgrp;

        lgrp = lgrp_table[lgrpid];
        if (!LGRP_EXISTS(lgrp) ||
            klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
            !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
                return (0);

        return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
}

/*
 * Initialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
{
        lgrp_shm_locality_t     *shm_locality;

        /*
         * Initialize the locality field in the anon_map.
         * No locks are needed because this is called when the anon_map is
         * allocated but not yet used anywhere.
         */
        if (amp) {
                ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
                if (amp->locality == NULL) {
                        /*
                         * Allocate and initialize shared memory locality info
                         * and set anon_map locality pointer to it
                         * Drop lock across kmem_alloc(KM_SLEEP)
                         */
                        ANON_LOCK_EXIT(&amp->a_rwlock);
                        shm_locality = kmem_alloc(sizeof (*shm_locality),
                            KM_SLEEP);
                        rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
                            NULL);
                        shm_locality->loc_count = 1;    /* not used for amp */
                        shm_locality->loc_tree = NULL;

                        /*
                         * Reacquire lock and check to see whether anyone beat
                         * us to initializing the locality info
                         */
                        ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
                        if (amp->locality != NULL) {
                                rw_destroy(&shm_locality->loc_lock);
                                kmem_free(shm_locality,
                                    sizeof (*shm_locality));
                        } else
                                amp->locality = shm_locality;
                }
                ANON_LOCK_EXIT(&amp->a_rwlock);
                return;
        }

        /*
         * Allocate shared vnode policy info if vnode is not locality aware yet
         */
        mutex_enter(&vp->v_lock);
        if ((vp->v_flag & V_LOCALITY) == 0) {
                /*
                 * Allocate and initialize shared memory locality info
                 */
                mutex_exit(&vp->v_lock);
                shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
                rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
                shm_locality->loc_count = 1;
                shm_locality->loc_tree = NULL;

                /*
                 * Point vnode locality field at shared vnode policy info
                 * and set locality aware flag in vnode
                 */
                mutex_enter(&vp->v_lock);
                if ((vp->v_flag & V_LOCALITY) == 0) {
                        vp->v_locality = shm_locality;
                        vp->v_flag |= V_LOCALITY;
                } else {
                        /*
                         * Lost race so free locality info and increment count.
                         */
                        rw_destroy(&shm_locality->loc_lock);
                        kmem_free(shm_locality, sizeof (*shm_locality));
                        shm_locality = vp->v_locality;
                        shm_locality->loc_count++;
                }
                mutex_exit(&vp->v_lock);

                return;
        }

        /*
         * Increment reference count of number of segments mapping this vnode
         * shared
         */
        shm_locality = vp->v_locality;
        shm_locality->loc_count++;
        mutex_exit(&vp->v_lock);
}

/*
 * Destroy the given shared memory policy segment tree
 */
void
lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
{
        lgrp_shm_policy_seg_t   *cur;
        lgrp_shm_policy_seg_t   *next;

        if (tree == NULL)
                return;

        cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
        while (cur != NULL) {
                next = AVL_NEXT(tree, cur);
                avl_remove(tree, cur);
                kmem_free(cur, sizeof (*cur));
                cur = next;
        }
        kmem_free(tree, sizeof (avl_tree_t));
}

/*
 * Uninitialize lgroup shared memory allocation policy support
 */
void
lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
{
        lgrp_shm_locality_t     *shm_locality;

        /*
         * For an anon_map, deallocate the shared memory policy tree and
         * zero the locality field.
         * No locks are needed because the anon_map is being freed.
         */
        if (amp) {
                if (amp->locality == NULL)
                        return;
                shm_locality = amp->locality;
                shm_locality->loc_count = 0;    /* not really used for amp */
                rw_destroy(&shm_locality->loc_lock);
                lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
                kmem_free(shm_locality, sizeof (*shm_locality));
                amp->locality = 0;
                return;
        }

        /*
         * For vnode, decrement reference count of segments mapping this vnode
         * shared and delete locality info if reference count drops to 0
         */
        mutex_enter(&vp->v_lock);
        shm_locality = vp->v_locality;
        shm_locality->loc_count--;

        if (shm_locality->loc_count == 0) {
                rw_destroy(&shm_locality->loc_lock);
                lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
                kmem_free(shm_locality, sizeof (*shm_locality));
                vp->v_locality = 0;
                vp->v_flag &= ~V_LOCALITY;
        }
        mutex_exit(&vp->v_lock);
}

/*
 * Compare two shared memory policy segments
 * Used by AVL tree code for searching
 */
int
lgrp_shm_policy_compar(const void *x, const void *y)
{
        lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
        lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;

        if (a->shm_off < b->shm_off)
                return (-1);
        if (a->shm_off >= b->shm_off + b->shm_size)
                return (1);
        return (0);
}
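
/*
 * Worked example for the comparator above (illustrative offsets): a segment
 * with shm_off 0x2000 and shm_size 0x3000 compares equal to any key offset
 * in the range [0x2000, 0x5000), which is what lets avl_find() return the
 * policy segment containing a given offset rather than requiring an exact
 * match on shm_off.
 */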

/*
 * Concatenate seg1 with seg2 and remove seg2
 */
static int
lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
    lgrp_shm_policy_seg_t *seg2)
{
        if (!seg1 || !seg2 ||
            seg1->shm_off + seg1->shm_size != seg2->shm_off ||
            seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
                return (-1);

        seg1->shm_size += seg2->shm_size;
        avl_remove(tree, seg2);
        kmem_free(seg2, sizeof (*seg2));
        return (0);
}

/*
 * Split segment at given offset and return rightmost (uppermost) segment
 * Assumes that there are no overlapping segments
 */
static lgrp_shm_policy_seg_t *
lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
    u_offset_t off)
{
        lgrp_shm_policy_seg_t   *newseg;
        avl_index_t             where;

        ASSERT(seg != NULL && (off >= seg->shm_off &&
            off <= seg->shm_off + seg->shm_size));

        if (!seg || off < seg->shm_off ||
            off > seg->shm_off + seg->shm_size) {
                return (NULL);
        }

        if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
                return (seg);

        /*
         * Adjust size of left segment and allocate new (right) segment
         */
        newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
        newseg->shm_policy = seg->shm_policy;
        newseg->shm_off = off;
        newseg->shm_size = seg->shm_size - (off - seg->shm_off);
        seg->shm_size = off - seg->shm_off;

        /*
         * Find where to insert new segment in AVL tree and insert it
         */
        (void) avl_find(tree, &off, &where);
        avl_insert(tree, newseg, where);

        return (newseg);
}
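
/*
 * Worked example for lgrp_shm_policy_split() above (illustrative offsets):
 * splitting a segment covering [0x0, 0x8000) at off 0x3000 shrinks the
 * original segment to [0x0, 0x3000) and inserts and returns a new segment
 * covering [0x3000, 0x8000) that inherits the original policy.  Splitting at
 * either end of the segment is a no-op that returns the segment unchanged.
 */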

/*
 * Set shared memory allocation policy on specified shared object at given
 * offset and length
 *
 * Return 0 if policy wasn't set already, 1 if policy was set already, and
 * -1 if the policy can't be set.
 */
int
lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
    ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
{
        u_offset_t              eoff;
        lgrp_shm_policy_seg_t   *next;
        lgrp_shm_policy_seg_t   *newseg;
        u_offset_t              off;
        u_offset_t              oldeoff;
        lgrp_shm_policy_seg_t   *prev;
        int                     retval;
        lgrp_shm_policy_seg_t   *seg;
        lgrp_shm_locality_t     *shm_locality;
        avl_tree_t              *tree;
        avl_index_t             where;

        ASSERT(amp || vp);
        ASSERT((len & PAGEOFFSET) == 0);

        if (len == 0)
                return (-1);

        retval = 0;

        /*
         * Get locality info and the starting offset into the shared object.
         * Try the anon map first and then the vnode.
         * Assume that no locks need to be held on the anon_map or vnode,
         * since each should be protected by its reference count, which must
         * be nonzero for an existing segment.
         */
        if (amp) {
                /*
                 * Get policy info from anon_map
                 */
                ASSERT(amp->refcnt != 0);
                if (amp->locality == NULL)
                        lgrp_shm_policy_init(amp, NULL);
                shm_locality = amp->locality;
                off = ptob(anon_index);
        } else if (vp) {
                /*
                 * Get policy info from vnode
                 */
                if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
                        lgrp_shm_policy_init(NULL, vp);
                shm_locality = vp->v_locality;
                ASSERT(shm_locality->loc_count != 0);
                off = vn_off;
        } else
                return (-1);

        ASSERT((off & PAGEOFFSET) == 0);

        /*
         * Figure out default policy
         */
        if (policy == LGRP_MEM_POLICY_DEFAULT)
                policy = lgrp_mem_policy_default(len, MAP_SHARED);

        /*
         * Create AVL tree if there isn't one yet
         * and set locality field to point at it
         */
        rw_enter(&shm_locality->loc_lock, RW_WRITER);
        tree = shm_locality->loc_tree;
        if (!tree) {
                rw_exit(&shm_locality->loc_lock);

                tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);

                rw_enter(&shm_locality->loc_lock, RW_WRITER);
                if (shm_locality->loc_tree == NULL) {
                        avl_create(tree, lgrp_shm_policy_compar,
                            sizeof (lgrp_shm_policy_seg_t),
                            offsetof(lgrp_shm_policy_seg_t, shm_tree));
                        shm_locality->loc_tree = tree;
                } else {
                        /*
                         * Another thread managed to set up the tree
                         * before we could. Free the tree we allocated
                         * and use the one that's already there.
                         */
                        kmem_free(tree, sizeof (*tree));
                        tree = shm_locality->loc_tree;
                }
        }

        /*
         * Set policy
         *
         * Need to maintain hold on writer's lock to keep tree from
         * changing out from under us
         */
        while (len != 0) {
                /*
                 * Find policy segment for specified offset into shared object
                 */
                seg = avl_find(tree, &off, &where);

                /*
                 * Didn't find any existing segment that contains specified
                 * offset, so allocate new segment, insert it, and concatenate
                 * with adjacent segments if possible
                 */
                if (seg == NULL) {
                        newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
                            KM_SLEEP);
                        newseg->shm_policy.mem_policy = policy;
                        newseg->shm_policy.mem_lgrpid = LGRP_NONE;
                        newseg->shm_off = off;
                        avl_insert(tree, newseg, where);

                        /*
                         * Check to see whether new segment overlaps with next
                         * one, set length of new segment accordingly, and
                         * calculate remaining length and next offset
                         */
                        seg = AVL_NEXT(tree, newseg);
                        if (seg == NULL || off + len <= seg->shm_off) {
                                newseg->shm_size = len;
                                len = 0;
                        } else {
                                newseg->shm_size = seg->shm_off - off;
                                off = seg->shm_off;
                                len -= newseg->shm_size;
                        }

                        /*
                         * Try to concatenate new segment with next and
                         * previous ones, since they might have the same policy
                         * now.  Grab previous and next segments first because
                         * they will change on concatenation.
                         */
                        prev =  AVL_PREV(tree, newseg);
                        next = AVL_NEXT(tree, newseg);
                        (void) lgrp_shm_policy_concat(tree, newseg, next);
                        (void) lgrp_shm_policy_concat(tree, prev, newseg);

                        continue;
                }

                eoff = off + len;
                oldeoff = seg->shm_off + seg->shm_size;

                /*
                 * Policy set already?
                 */
                if (policy == seg->shm_policy.mem_policy) {
                        /*
                         * Nothing left to do if offset and length
                         * fall within this segment
                         */
                        if (eoff <= oldeoff) {
                                retval = 1;
                                break;
                        } else {
                                len = eoff - oldeoff;
                                off = oldeoff;
                                continue;
                        }
                }

                /*
                 * Specified offset and length match existing segment exactly
                 */
                if (off == seg->shm_off && len == seg->shm_size) {
                        /*
                         * Set policy and update current length
                         */
                        seg->shm_policy.mem_policy = policy;
                        seg->shm_policy.mem_lgrpid = LGRP_NONE;
                        len = 0;

                        /*
                         * Try concatenating new segment with previous and next
                         * segments, since they might have the same policy now.
                         * Grab previous and next segments first because they
                         * will change on concatenation.
                         */
                        prev =  AVL_PREV(tree, seg);
                        next = AVL_NEXT(tree, seg);
                        (void) lgrp_shm_policy_concat(tree, seg, next);
                        (void) lgrp_shm_policy_concat(tree, prev, seg);
                } else {
                        /*
                         * Specified offset and length only apply to part of
                         * existing segment
                         */

                        /*
                         * New segment starts in middle of old one, so split
                         * new one off near beginning of old one
                         */
                        newseg = NULL;
                        if (off > seg->shm_off) {
                                newseg = lgrp_shm_policy_split(tree, seg, off);

                                /*
                                 * New segment ends where old one did, so try
                                 * to concatenate with next segment
                                 */
                                if (eoff == oldeoff) {
                                        newseg->shm_policy.mem_policy = policy;
                                        newseg->shm_policy.mem_lgrpid =
                                            LGRP_NONE;
                                        (void) lgrp_shm_policy_concat(tree,
                                            newseg, AVL_NEXT(tree, newseg));
                                        break;
                                }
                        }

                        /*
                         * New segment ends before old one, so split off end of
                         * old one
                         */
                        if (eoff < oldeoff) {
                                if (newseg) {
                                        (void) lgrp_shm_policy_split(tree,
                                            newseg, eoff);
                                        newseg->shm_policy.mem_policy = policy;
                                        newseg->shm_policy.mem_lgrpid =
                                            LGRP_NONE;
                                } else {
                                        (void) lgrp_shm_policy_split(tree, seg,
                                            eoff);
                                        seg->shm_policy.mem_policy = policy;
                                        seg->shm_policy.mem_lgrpid = LGRP_NONE;
                                }

                                if (off == seg->shm_off)
                                        (void) lgrp_shm_policy_concat(tree,
                                            AVL_PREV(tree, seg), seg);
                                break;
                        }

                        /*
                         * Calculate remaining length and next offset
                         */
                        len = eoff - oldeoff;
                        off = oldeoff;
                }
        }

        rw_exit(&shm_locality->loc_lock);
        return (retval);
}
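
/*
 * Worked example for lgrp_shm_policy_set() above (illustrative offsets):
 * if the tree holds a single LGRP_MEM_POLICY_NEXT segment covering
 * [0x0, 0x8000) and RANDOM policy is requested for [0x2000, 0x6000), the
 * loop splits the segment twice, leaving three segments:
 *
 *	[0x0, 0x2000)     NEXT
 *	[0x2000, 0x6000)  RANDOM
 *	[0x6000, 0x8000)  NEXT
 *
 * A later request whose range falls within a segment that already has the
 * requested policy returns 1 without modifying the tree.
 */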

/*
 * Return the best memnode from which to allocate memory given
 * an lgroup.
 *
 * "c" is for cookie, which is good enough for me.
 * It references a cookie struct that should be zero'ed to initialize.
 * The cookie should live on the caller's stack.
 *
 * The routine returns -1 when:
 *      - the search scope is local and all the memnodes in "lgrp" have been
 *        returned.
 *      - the search scope allows walking up the hierarchy and all the
 *        memnodes in the system have been returned.
 */
int
lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
{
        lgrp_t          *lp = c->lmc_lgrp;
        mnodeset_t      nodes = c->lmc_nodes;
        int             cnt = c->lmc_cnt;
        int             offset, mnode;

        extern int      max_mem_nodes;

        /*
         * If the set is empty, and the caller is willing, traverse
         * up the hierarchy until we find a non-empty set.
         */
        while (nodes == (mnodeset_t)0 || cnt <= 0) {
                if (c->lmc_scope == LGRP_SRCH_LOCAL ||
                    ((lp = lp->lgrp_parent) == NULL))
                        return (-1);

                nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
                cnt = lp->lgrp_nmnodes - c->lmc_ntried;
        }

        /*
         * Select a memnode by picking one at a "random" offset.
         * Because of DR, memnodes can come and go at any time.
         * This code must be able to cope with the possibility
         * that the nodes count "cnt" is inconsistent with respect
         * to the number of elements actually in "nodes", and
         * therefore that the offset chosen could be greater than
         * the number of elements in the set (some memnodes may
         * have disappeared just before cnt was read).
         * If this happens, the search simply wraps back to the
         * beginning of the set.
         */
        ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
        offset = c->lmc_rand % cnt;
        do {
                for (mnode = 0; mnode < max_mem_nodes; mnode++)
                        if (nodes & ((mnodeset_t)1 << mnode))
                                if (!offset--)
                                        break;
        } while (mnode >= max_mem_nodes);

        /* Found a node. Store state before returning. */
        c->lmc_lgrp = lp;
        c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
        c->lmc_cnt = cnt - 1;
        c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
        c->lmc_ntried++;

        return (mnode);
}
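
/*
 * Hedged usage sketch for lgrp_memnode_choose() above (illustrative only;
 * assumes the cookie initialization macro from <sys/lgrp.h>): a caller
 * typically seeds the stack-resident cookie with an lgroup and search scope,
 * then pulls memnodes until the routine reports exhaustion with -1.
 *
 *	lgrp_mnode_cookie_t c;
 *	int mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		(try to allocate a page from mnode; stop on success)
 *	}
 */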