root/usr/src/uts/common/os/pool_pset.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License").  You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2004 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 2019 Joyent, Inc.
 */

#include <sys/pool.h>
#include <sys/pool_impl.h>
#include <sys/pool_pset.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/mutex.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/fss.h>
#include <sys/exacct.h>
#include <sys/time.h>
#include <sys/policy.h>
#include <sys/class.h>
#include <sys/list.h>
#include <sys/cred.h>
#include <sys/zone.h>

/*
 * Processor set plugin for pools.
 *
 * This file contains various routines used by the common pools layer to create,
 * modify, and destroy processor sets.  All processor sets created by this
 * plug-in are stored in the pool_pset_list doubly-linked list, which is
 * guaranteed to always have an entry for the default processor set,
 * pool_pset_default.
 *
 * Interaction with zones:
 *
 * If pools are enabled, non-global zones only have visibility into the
 * pset of the pool to which they are bound.  This is accomplished by
 * changing the set of processors and processor sets which are visible
 * through both systemcall interfaces and system kstats.
 *
 * To avoid grabbing pool_lock() during cpu change operations, we cache
 * the pset the zone is currently bound to, and can read this value
 * while under cpu_lock.  The special psetid_t token ZONE_PS_INVAL means
 * that pools are disabled, and provides a mechanism for determining the
 * status of pools without grabbing pool_lock().
 *
 * To avoid grabbing any locks to determine the instantaneous value of
 * the number of configured and online cpus in the zone, we also cache
 * these values in a zone_t.  If these values are zero, the pools
 * facility must be disabled, in which case relevant systemcall
 * interfaces will return the values for the system as a whole.
 *
 * The various kstat interfaces are dealt with as follows: if pools are
 * disabled all cpu-related kstats should be exported to all zones.
 * When pools are enabled we begin maintaining a list of "permitted
 * zones" on a per-kstat basis.  There are various hooks throughout the
 * code to update this list when certain pools- or cpu-related events
 * occur.
 */

/* every pset known to this plug-in, linked through pset_link */
static list_t pool_pset_list;   /* doubly-linked list of psets */
/* allocated by pool_pset_init() with pset_id PS_NONE */
pool_pset_t *pool_pset_default; /* default pset */
/* both timestamps below are gethrtime() values */
hrtime_t pool_pset_mod;         /* last modification time for psets */
hrtime_t pool_cpu_mod;          /* last modification time for CPUs */

/*
 * Walk pool_pset_list looking for the pset whose pset_id matches psetid.
 * Returns the matching entry, or NULL when no entry matches.
 * pool_lock must be held by the caller.
 */
static pool_pset_t *
pool_lookup_pset_by_id(psetid_t psetid)
{
        pool_pset_t *cur;

        ASSERT(pool_lock_held());

        cur = list_head(&pool_pset_list);
        while (cur != NULL) {
                if (cur->pset_id == psetid)
                        break;
                cur = list_next(&pool_pset_list, cur);
        }
        /* cur is either the match or NULL once the list is exhausted */
        return (cur);
}

/*
 * Argument bundle passed through zone_walk() to pool_pset_setup_cb();
 * filled in by pool_pset_cpu_setup().
 */
struct setup_arg {
        /* pset the cpu belongs to, as reported by cpupart_query_cpu() */
        psetid_t psetid;
        /* the cpu the event applies to */
        cpu_t *cpu;
        /* the cpu event being delivered */
        cpu_setup_t what;
};

/*
 * zone_walk() callback: reflect one cpu configuration event in a zone.
 *
 * arg points to a struct setup_arg.  The global zone and any zone whose
 * cached pset differs from the event's pset are left untouched.  Always
 * returns 0; an unrecognised event value panics.
 */
static int
pool_pset_setup_cb(zone_t *zone, void *arg)
{
        struct setup_arg *ev = arg;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(INGLOBALZONE(curproc));
        ASSERT(zone != NULL);

        /* Nothing to do for the global zone or for zones on another pset. */
        if (zone == global_zone || zone_pset_get(zone) != ev->psetid)
                return (0);

        switch (ev->what) {
        case CPU_CONFIG:
                cpu_visibility_configure(ev->cpu, zone);
                break;
        case CPU_UNCONFIG:
                cpu_visibility_unconfigure(ev->cpu, zone);
                break;
        case CPU_ON:
                cpu_visibility_online(ev->cpu, zone);
                break;
        case CPU_OFF:
                cpu_visibility_offline(ev->cpu, zone);
                break;
        case CPU_CPUPART_IN:
                cpu_visibility_add(ev->cpu, zone);
                break;
        case CPU_CPUPART_OUT:
                cpu_visibility_remove(ev->cpu, zone);
                break;
        default:
                cmn_err(CE_PANIC, "invalid cpu_setup_t value %d", ev->what);
        }
        return (0);
}

/*
 * cpu setup callback (registered in pool_pset_init()).  When pools are
 * enabled and the event is one of the six that affect zone visibility,
 * forward it to every zone through pool_pset_setup_cb().  All other
 * events, and all events while pools are disabled, are ignored.
 * Always returns 0.
 */
/* ARGSUSED */
static int
pool_pset_cpu_setup(cpu_setup_t what, int id, void *arg)
{
        struct setup_arg event;
        cpu_t *cpu;
        int rv;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(INGLOBALZONE(curproc));

        if (!pool_pset_enabled())
                return (0);

        switch (what) {
        case CPU_CONFIG:
        case CPU_UNCONFIG:
        case CPU_ON:
        case CPU_OFF:
        case CPU_CPUPART_IN:
        case CPU_CPUPART_OUT:
                break;
        default:
                /* not an event we track */
                return (0);
        }

        cpu = cpu_get((processorid_t)id);
        ASSERT(cpu != NULL);

        event.what = what;
        event.cpu = cpu;
        event.psetid = cpupart_query_cpu(cpu);

        rv = zone_walk(pool_pset_setup_cb, &event);
        ASSERT(rv == 0);
        return (0);
}

/*
 * Initialize processor set plugin.  Called once at boot time.
 */
void
pool_pset_init(void)
{
        ASSERT(pool_pset_default == NULL);
        pool_pset_default = kmem_zalloc(sizeof (pool_pset_t), KM_SLEEP);
        pool_pset_default->pset_id = PS_NONE;
        pool_pset_default->pset_npools = 1;     /* for pool_default */
        pool_default->pool_pset = pool_pset_default;
        list_create(&pool_pset_list, sizeof (pool_pset_t),
            offsetof(pool_pset_t, pset_link));
        list_insert_head(&pool_pset_list, pool_pset_default);
        mutex_enter(&cpu_lock);
        register_cpu_setup_func(pool_pset_cpu_setup, NULL);
        mutex_exit(&cpu_lock);
}

/*
 * zone_walk() adapter around zone_pset_set().  The pset id travels in the
 * opaque arg pointer; the function always returns 0.
 */
static int
pool_pset_zone_pset_set(zone_t *zone, void *arg)
{
        ASSERT(MUTEX_HELD(&cpu_lock));

        zone_pset_set(zone, (psetid_t)(uintptr_t)arg);
        return (0);
}

/*
 * Enable processor set plugin.
 *
 * The caller must hold pool_lock and be in the global zone (both are
 * asserted).  Returns EEXIST when more than one cpu partition exists
 * (cp_numparts > 1) and 0 otherwise.  On success the default pset's
 * property list is allocated and populated, and both modification
 * timestamps are reset to the current gethrtime().
 */
int
pool_pset_enable(void)
{
        int error;
        nvlist_t *props;

        ASSERT(pool_lock_held());
        ASSERT(INGLOBALZONE(curproc));
        /*
         * Can't enable pools if there are existing cpu partitions.
         */
        mutex_enter(&cpu_lock);
        if (cp_numparts > 1) {
                mutex_exit(&cpu_lock);
                return (EEXIST);
        }

        /*
         * We want to switch things such that everything that was tagged with
         * the special ALL_ZONES token now is explicitly visible to all zones:
         * first add individual zones to the visibility list then remove the
         * special "ALL_ZONES" token.  There must only be the default pset
         * (PS_NONE) active if pools are being enabled, so we only need to
         * deal with it.
         *
         * We want to make pool_pset_enabled() start returning B_TRUE before
         * we call any of the visibility update functions.
         */
        global_zone->zone_psetid = PS_NONE;
        /*
         * We need to explicitly handle the global zone since
         * zone_pset_set() won't modify it.
         */
        pool_pset_visibility_add(PS_NONE, global_zone);
        /*
         * A NULL argument means the ALL_ZONES token.
         */
        pool_pset_visibility_remove(PS_NONE, NULL);
        /* PS_NONE is carried to the callback inside the void * argument */
        error = zone_walk(pool_pset_zone_pset_set, (void *)PS_NONE);
        ASSERT(error == 0);

        /*
         * It is safe to drop cpu_lock here.  We're still
         * holding pool_lock so no new cpu partitions can
         * be created while we're here.
         */
        mutex_exit(&cpu_lock);
        /*
         * Build the default pset's property list.  The return values of
         * the nvlist calls are deliberately discarded.
         */
        (void) nvlist_alloc(&pool_pset_default->pset_props,
            NV_UNIQUE_NAME, KM_SLEEP);
        props = pool_pset_default->pset_props;
        (void) nvlist_add_string(props, "pset.name", "pset_default");
        (void) nvlist_add_string(props, "pset.comment", "");
        (void) nvlist_add_int64(props, "pset.sys_id", PS_NONE);
        (void) nvlist_add_string(props, "pset.units", "population");
        (void) nvlist_add_byte(props, "pset.default", 1);
        (void) nvlist_add_uint64(props, "pset.max", 65536);
        (void) nvlist_add_uint64(props, "pset.min", 1);
        pool_pset_mod = pool_cpu_mod = gethrtime();
        return (0);
}

/*
 * Disable processor set plugin.
 *
 * The caller must hold pool_lock and be in the global zone (both are
 * asserted).  Returns EBUSY when more than one cpu partition exists
 * (cp_numparts > 1) and 0 otherwise.  On success every cpu's property
 * list and the default pset's property list are freed.
 */
int
pool_pset_disable(void)
{
        processorid_t cpuid;
        cpu_t *cpu;
        int error;

        ASSERT(pool_lock_held());
        ASSERT(INGLOBALZONE(curproc));

        mutex_enter(&cpu_lock);
        if (cp_numparts > 1) {  /* make sure only default pset is left */
                mutex_exit(&cpu_lock);
                return (EBUSY);
        }
        /*
         * Remove all non-system CPU and processor set properties
         */
        for (cpuid = 0; cpuid < NCPU; cpuid++) {
                /* ids for which cpu_get() finds no cpu are skipped */
                if ((cpu = cpu_get(cpuid)) == NULL)
                        continue;
                if (cpu->cpu_props != NULL) {
                        (void) nvlist_free(cpu->cpu_props);
                        cpu->cpu_props = NULL;
                }
        }

        /*
         * We want to switch things such that everything is now visible
         * to ALL_ZONES: first add the special "ALL_ZONES" token to the
         * visibility list then remove individual zones.  There must
         * only be the default pset active if pools are being disabled,
         * so we only need to deal with it.
         */
        error = zone_walk(pool_pset_zone_pset_set, (void *)ZONE_PS_INVAL);
        ASSERT(error == 0);
        /* a NULL zone argument stands for the ALL_ZONES token */
        pool_pset_visibility_add(PS_NONE, NULL);
        pool_pset_visibility_remove(PS_NONE, global_zone);
        /*
         * pool_pset_enabled() will henceforth return B_FALSE.
         */
        global_zone->zone_psetid = ZONE_PS_INVAL;
        mutex_exit(&cpu_lock);
        /* the default pset's properties are released after cpu_lock is dropped */
        if (pool_pset_default->pset_props != NULL) {
                nvlist_free(pool_pset_default->pset_props);
                pool_pset_default->pset_props = NULL;
        }
        return (0);
}

/*
 * Create a new processor set and append it to pool_pset_list.
 *
 * The new cpu partition's id is stored through *id.  The pset starts with
 * no associated pools and is given a temporary name of the form
 * "pset_<hrtime>", built from the modification timestamp recorded here.
 * Returns 0 on success, or the error reported by cpupart_create().
 */
int
pool_pset_create(psetid_t *id)
{
        char tmpname[40];
        pool_pset_t *newset;
        nvlist_t *props;
        psetid_t newid;
        hrtime_t now;
        int rc;

        ASSERT(pool_lock_held());

        rc = cpupart_create(&newid);
        if (rc != 0)
                return (rc);

        newset = kmem_alloc(sizeof (*newset), KM_SLEEP);
        newset->pset_id = newid;
        *id = newid;
        newset->pset_npools = 0;

        (void) nvlist_alloc(&newset->pset_props, NV_UNIQUE_NAME, KM_SLEEP);
        props = newset->pset_props;
        (void) nvlist_add_int64(props, "pset.sys_id", newid);
        (void) nvlist_add_byte(props, "pset.default", 0);

        /* The timestamp doubles as the suffix of the temporary name. */
        now = gethrtime();
        pool_pset_mod = now;
        (void) snprintf(tmpname, sizeof (tmpname), "pset_%lld", now);
        (void) nvlist_add_string(props, "pset.name", tmpname);

        list_insert_tail(&pool_pset_list, newset);
        return (0);
}

/*
 * Destroy an existing processor set.
 *
 * Returns EINVAL for the default pset (PS_NONE), ESRCH if psetid is not
 * on pool_pset_list, EBUSY if any pool is still associated with the pset,
 * or whatever cpupart_destroy() reports on failure.  On success the
 * pset's properties and the pset itself are freed and 0 is returned.
 */
int
pool_pset_destroy(psetid_t psetid)
{
        pool_pset_t *victim;
        int rc;

        ASSERT(pool_lock_held());

        if (psetid == PS_NONE)
                return (EINVAL);

        victim = pool_lookup_pset_by_id(psetid);
        if (victim == NULL)
                return (ESRCH);

        /* psets that still have pools associated cannot be destroyed */
        if (victim->pset_npools > 0)
                return (EBUSY);

        rc = cpupart_destroy(victim->pset_id);
        if (rc != 0)
                return (rc);

        (void) nvlist_free(victim->pset_props);
        list_remove(&pool_pset_list, victim);
        pool_pset_mod = gethrtime();
        kmem_free(victim, sizeof (pool_pset_t));
        return (0);
}

/*
 * Add or remove a zone's visibility of a pset: its kstat (if it has one)
 * and every cpu in the pset that is not powered off.  A NULL zone stands
 * for the ALL_ZONES token.  cpu_lock must be held.
 */
static void
pool_pset_visibility_change(psetid_t psetid, zone_t *zone, boolean_t add)
{
        zoneid_t zid;
        cpupart_t *part;
        cpu_t *cpu;

        if (zone != NULL)
                zid = zone->zone_id;
        else
                zid = ALL_ZONES;

        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(psetid != ZONE_PS_INVAL);

        part = cpupart_find(psetid);
        ASSERT(part != NULL);

        if (part->cp_kstat != NULL) {
                if (add)
                        kstat_zone_add(part->cp_kstat, zid);
                else
                        kstat_zone_remove(part->cp_kstat, zid);
        }

        /* cpu_list is circular: stop once we are back at the head */
        cpu = cpu_list;
        do {
                ASSERT(cpu != NULL);
                if (cpu->cpu_part != part || cpu_is_poweredoff(cpu))
                        continue;
                if (add)
                        cpu_visibility_add(cpu, zone);
                else
                        cpu_visibility_remove(cpu, zone);
        } while ((cpu = cpu->cpu_next) != cpu_list);
}

/*
 * Make the processor set visible to the zone.  A NULL value for
 * the zone means that the special ALL_ZONES token should be added to
 * the visibility list.
 *
 * Thin wrapper: calls pool_pset_visibility_change() with add == B_TRUE,
 * which asserts that cpu_lock is held.
 */
void
pool_pset_visibility_add(psetid_t psetid, zone_t *zone)
{
        pool_pset_visibility_change(psetid, zone, B_TRUE);
}

/*
 * Remove zone's visibility into the processor set.  A NULL value for
 * the zone means that the special ALL_ZONES token should be removed
 * from the visibility list.
 *
 * Thin wrapper: calls pool_pset_visibility_change() with add == B_FALSE,
 * which asserts that cpu_lock is held.
 */
void
pool_pset_visibility_remove(psetid_t psetid, zone_t *zone)
{
        pool_pset_visibility_change(psetid, zone, B_FALSE);
}

/*
 * Quick way of seeing if pools are enabled (as far as processor sets are
 * concerned) without holding pool_lock().
 */
boolean_t
pool_pset_enabled(void)
{
        ASSERT(MUTEX_HELD(&cpu_lock));

        return (zone_pset_get(global_zone) != ZONE_PS_INVAL);
}

/*
 * Argument bundle passed through zone_walk() to pool_pset_assoc_zone_cb();
 * filled in by pool_pset_assoc().
 */
struct assoc_zone_arg {
        /* id of the pool whose pset association changed */
        poolid_t poolid;
        /* id of the pset the pool is now associated with */
        psetid_t newpsetid;
};

/*
 * zone_walk() callback used after a pool has been associated with a new
 * pset: any non-global zone whose pool is the one named in arg has its
 * pset switched to the new one.  Always returns 0.
 */
static int
pool_pset_assoc_zone_cb(zone_t *zone, void *arg)
{
        struct assoc_zone_arg *change = arg;

        ASSERT(pool_lock_held());
        ASSERT(MUTEX_HELD(&cpu_lock));

        /* the global zone is never touched here */
        if (zone->zone_id == GLOBAL_ZONEID)
                return (0);

        if (zone_pool_get(zone)->pool_id == change->poolid)
                zone_pset_set(zone, change->newpsetid);
        return (0);
}

/*
 * Associate a pool with a different processor set.
 *
 * Returns ESRCH if either the pool or the pset cannot be found, 0 if the
 * pool is already associated with the pset, or the error from
 * pool_do_bind() if rebinding fails (in which case the pool keeps its old
 * pset).  On success the zones bound to the pool are switched to the new
 * pset and the per-pset pool counts are adjusted.
 */
int
pool_pset_assoc(poolid_t poolid, psetid_t psetid)
{
        struct assoc_zone_arg change;
        pool_pset_t *newset, *prevset;
        pool_t *pool;
        int err;

        ASSERT(pool_lock_held());

        pool = pool_lookup_pool_by_id(poolid);
        if (pool == NULL)
                return (ESRCH);
        newset = pool_lookup_pset_by_id(psetid);
        if (newset == NULL)
                return (ESRCH);

        prevset = pool->pool_pset;
        if (prevset->pset_id == psetid)
                return (0);             /* already associated */

        /*
         * Point the pool at the new pset before rebinding its processes;
         * if the rebind fails, restore the old pset so nothing changes.
         */
        pool->pool_pset = newset;
        err = pool_do_bind(pool, P_POOLID, poolid, POOL_BIND_PSET);
        if (err != 0) {
                pool->pool_pset = prevset;
                return (err);
        }

        /* Bring the zones' cached psets in line with the new association. */
        change.poolid = poolid;
        change.newpsetid = newset->pset_id;
        mutex_enter(&cpu_lock);
        err = zone_walk(pool_pset_assoc_zone_cb, &change);
        ASSERT(err == 0);
        mutex_exit(&cpu_lock);

        prevset->pset_npools--;
        newset->pset_npools++;
        return (err);
}

/*
 * Move the listed CPUs from processor set src to processor set dst.
 *
 * ids holds size cpu ids.  Returns EINVAL if size is zero or larger than
 * max_ncpus, or if any listed cpu does not exist or is not currently in
 * src; otherwise returns the first error from cpupart_attach_cpu(), or 0.
 * Processing stops at the first failure; cpus already moved are not moved
 * back.  The pset modification time is updated only on full success.
 */
int
pool_pset_xtransfer(psetid_t src, psetid_t dst, size_t size, id_t *ids)
{
        struct cpu *cp;
        int rv = 0;
        int i;

        ASSERT(pool_lock_held());
        ASSERT(INGLOBALZONE(curproc));

        /* quick sanity check */
        if (size == 0 || size > max_ncpus)
                return (EINVAL);

        mutex_enter(&cpu_lock);
        for (i = 0; i < size; i++) {
                cp = cpu_get((processorid_t)ids[i]);
                if (cp == NULL || cpupart_query_cpu(cp) != src) {
                        rv = EINVAL;
                        break;
                }
                rv = cpupart_attach_cpu(dst, cp, 1);
                if (rv != 0)
                        break;
        }
        mutex_exit(&cpu_lock);

        if (rv == 0)
                pool_pset_mod = gethrtime();
        return (rv);
}

/*
 * Bind every thread of process p to processor set psetid.  Failure is not
 * expected here (it is only asserted against), because all preliminary
 * checks are supposed to have been made before this is called.
 * A process with no threads is left alone.
 */
void
pool_pset_bind(proc_t *p, psetid_t psetid, void *projbuf, void *zonebuf)
{
        kthread_t *thr;
        int rv;

        ASSERT(pool_lock_held());
        ASSERT(MUTEX_HELD(&cpu_lock));
        ASSERT(MUTEX_HELD(&pidlock));
        ASSERT(MUTEX_HELD(&p->p_lock));

        thr = p->p_tlist;
        if (thr == NULL)
                return;

        /* p_tlist is circular; go round once */
        for (;;) {
                rv = cpupart_bind_thread(thr, psetid, 0, projbuf, zonebuf);
                ASSERT(rv == 0);
                thr->t_bind_pset = psetid;

                thr = thr->t_forw;
                if (thr == p->p_tlist)
                        break;
        }
}

/*
 * See the comment above pool_do_bind() for the semantics of the pset_bind_*()
 * functions.  These must be kept in sync with cpupart_move_thread, and
 * anything else that could fail a pool_pset_bind.
 *
 * Verifies that every thread of every process in the NULL-terminated
 * procs array may be moved to the pool's processor set:
 *   ENOTSUP - the target pset has no cpus
 *   EPERM   - the caller fails secpolicy_setpriority() or lacks
 *             permission over one of the processes
 *   other   - error from cpupart_movable_thread()
 *
 * Returns non-zero errno on failure and zero on success.
 * Iff successful, cpu_lock is held and weak binding is stopped on return;
 * on every failure path both are undone before returning.
 */
int
pset_bind_start(proc_t **procs, pool_t *pool)
{
        cred_t *cr = NULL;
        cpupart_t *target;
        proc_t **pp;
        int err;

        extern int cpupart_movable_thread(kthread_id_t, cpupart_t *, int);

        ASSERT(pool_lock_held());
        ASSERT(INGLOBALZONE(curproc));

        mutex_enter(&cpu_lock);
        weakbinding_stop();

        target = cpupart_find(pool->pool_pset->pset_id);
        ASSERT(target != NULL);
        if (target->cp_cpulist == NULL) {
                err = ENOTSUP;
                goto fail;
        }

        cr = crgetcred();

        /*
         * Check for the PRIV_PROC_PRIOCNTL privilege that is required
         * to enter and exit scheduling classes.  If other privileges
         * are required by CL_ENTERCLASS/CL_CANEXIT types of routines
         * in the future, this code will have to be updated.
         */
        if (secpolicy_setpriority(cr) != 0) {
                err = EPERM;
                goto fail;
        }

        for (pp = procs; *pp != NULL; pp++) {
                proc_t *p = *pp;
                kthread_t *t;

                mutex_enter(&p->p_lock);
                t = p->p_tlist;
                if (t == NULL) {
                        /* no threads, nothing to check */
                        mutex_exit(&p->p_lock);
                        continue;
                }

                /* Check our basic permissions to control this process. */
                if (!prochasprocperm(p, curproc, cr)) {
                        mutex_exit(&p->p_lock);
                        err = EPERM;
                        goto fail;
                }

                /* Every thread must be movable to the new processor set. */
                do {
                        thread_lock(t);
                        err = cpupart_movable_thread(t, target, 0);
                        thread_unlock(t);
                        if (err != 0) {
                                mutex_exit(&p->p_lock);
                                goto fail;
                        }
                        t = t->t_forw;
                } while (t != p->p_tlist);
                mutex_exit(&p->p_lock);
        }

        crfree(cr);
        return (0);     /* with cpu_lock held and weakbinding stopped */

fail:
        /* Common unwind: p_lock, if it was taken, has already been dropped. */
        weakbinding_start();
        mutex_exit(&cpu_lock);
        if (cr != NULL)
                crfree(cr);
        return (err);
}

/*
 * Abandon a bind operation after pset_bind_start() has succeeded.
 *
 * A successful pset_bind_start() returns with cpu_lock held and weak
 * binding stopped, so both must be undone here, in the same order that
 * pset_bind_finish() uses: restart weak binding while cpu_lock is still
 * held, then drop the lock.  The arguments are unused.
 */
/*ARGSUSED*/
void
pset_bind_abort(proc_t **procs, pool_t *pool)
{
        weakbinding_start();
        mutex_exit(&cpu_lock);
}

/*
 * Complete a bind operation begun by a successful pset_bind_start(),
 * which returns with cpu_lock held and weak binding stopped: restart
 * weak binding, then drop cpu_lock.
 */
void
pset_bind_finish(void)
{
        weakbinding_start();
        mutex_exit(&cpu_lock);
}

/*
 * Property table for processor sets, handed to pool_propput_common() and
 * pool_proprm_common() by pool_pset_propput() and pool_pset_proprm().
 * Each entry is { property name, nvpair data type, PP_* flags }; the
 * entry with a NULL name terminates the table.
 * NOTE(review): the PP_* flags presumably encode read/write access and
 * optionality -- confirm against the pool_property_t definition.
 */
static pool_property_t pool_pset_props[] = {
        { "pset.name",                  DATA_TYPE_STRING,       PP_RDWR },
        { "pset.comment",               DATA_TYPE_STRING,       PP_RDWR },
        { "pset.sys_id",                DATA_TYPE_UINT64,       PP_READ },
        { "pset.units",                 DATA_TYPE_STRING,       PP_RDWR },
        { "pset.default",               DATA_TYPE_BYTE,         PP_READ },
        { "pset.min",                   DATA_TYPE_UINT64,       PP_RDWR },
        { "pset.max",                   DATA_TYPE_UINT64,       PP_RDWR },
        { "pset.size",                  DATA_TYPE_UINT64,       PP_READ },
        { "pset.load",                  DATA_TYPE_UINT64,       PP_READ },
        { "pset.poold.objectives",      DATA_TYPE_STRING,
            PP_RDWR | PP_OPTIONAL },
        { NULL,                         0,                      0 }
};

/*
 * Property table for CPUs, handed to pool_propput_common() and
 * pool_proprm_common() by pool_cpu_propput() and pool_cpu_proprm().
 * Each entry is { property name, nvpair data type, PP_* flags }; the
 * entry with a NULL name terminates the table.
 */
static pool_property_t pool_cpu_props[] = {
        { "cpu.sys_id",                 DATA_TYPE_UINT64,       PP_READ },
        { "cpu.comment",                DATA_TYPE_STRING,       PP_RDWR },
        { "cpu.status",                 DATA_TYPE_STRING,       PP_RDWR },
        { "cpu.pinned",                 DATA_TYPE_BYTE,
            PP_RDWR | PP_OPTIONAL },
        { NULL,                         0,                      0 }
};

/*
 * Store a property on the specified processor set.
 *
 * Returns ESRCH if the pset is not found, otherwise the result of
 * pool_propput_common().  The pset modification time is refreshed when
 * the property was stored successfully.
 */
int
pool_pset_propput(psetid_t psetid, nvpair_t *pair)
{
        pool_pset_t *target;
        int rv;

        ASSERT(pool_lock_held());

        target = pool_lookup_pset_by_id(psetid);
        if (target == NULL)
                return (ESRCH);

        rv = pool_propput_common(target->pset_props, pair, pool_pset_props);
        if (rv != 0)
                return (rv);

        pool_pset_mod = gethrtime();
        return (0);
}

/*
 * Remove a property from the specified processor set.
 *
 * Returns EINVAL if the pset is not found, otherwise the result of
 * pool_proprm_common().  The pset modification time is refreshed when
 * the property was removed successfully.
 */
int
pool_pset_proprm(psetid_t psetid, char *name)
{
        pool_pset_t *target;
        int rv;

        ASSERT(pool_lock_held());

        target = pool_lookup_pset_by_id(psetid);
        if (target == NULL)
                return (EINVAL);

        rv = pool_proprm_common(target->pset_props, name, pool_pset_props);
        if (rv != 0)
                return (rv);

        pool_pset_mod = gethrtime();
        return (0);
}

/*
 * Put new CPU property.
 *
 * A string pair named "cpu.status" is handled specially: its value is
 * translated to the matching P_* state and applied with
 * p_online_internal() instead of being stored in the cpu's property list.
 * Any other pair is stored in cpu->cpu_props through
 * pool_propput_common(); the list is allocated on first use and seeded
 * with an empty "cpu.comment".
 *
 * Returns EINVAL if the status string is not recognised or if cpuid does
 * not name an existing cpu; otherwise the result of p_online_internal()
 * or pool_propput_common().  The CPU modification time is refreshed when
 * a property was stored successfully.
 */
int
pool_cpu_propput(processorid_t cpuid, nvpair_t *pair)
{
        int ret = 0;
        cpu_t *cpu;

        ASSERT(pool_lock_held());
        ASSERT(INGLOBALZONE(curproc));

        if (nvpair_type(pair) == DATA_TYPE_STRING &&
            strcmp(nvpair_name(pair), "cpu.status") == 0) {
                char *val;
                int status;
                int old_status;
                (void) nvpair_value_string(pair, &val);
                if (strcmp(val, PS_OFFLINE) == 0)
                        status = P_OFFLINE;
                else if (strcmp(val, PS_ONLINE) == 0)
                        status = P_ONLINE;
                else if (strcmp(val, PS_NOINTR) == 0)
                        status = P_NOINTR;
                else if (strcmp(val, PS_FAULTED) == 0)
                        status = P_FAULTED;
                else if (strcmp(val, PS_SPARE) == 0)
                        status = P_SPARE;
                else
                        return (EINVAL);
                ret = p_online_internal(cpuid, status, &old_status);
        } else {
                mutex_enter(&cpu_lock);
                /*
                 * Bail out before touching cpu->cpu_props: an unknown
                 * cpuid yields a NULL cpu, which must not be dereferenced.
                 */
                if ((cpu = cpu_get(cpuid)) == NULL) {
                        mutex_exit(&cpu_lock);
                        return (EINVAL);
                }
                if (cpu->cpu_props == NULL) {
                        (void) nvlist_alloc(&cpu->cpu_props,
                            NV_UNIQUE_NAME, KM_SLEEP);
                        (void) nvlist_add_string(cpu->cpu_props,
                            "cpu.comment", "");
                }
                ret = pool_propput_common(cpu->cpu_props, pair, pool_cpu_props);
                if (ret == 0)
                        pool_cpu_mod = gethrtime();
                mutex_exit(&cpu_lock);
        }
        return (ret);
}

/*
 * Remove a property from a CPU.
 *
 * Returns EINVAL if the cpu does not exist, is powered off, or has no
 * property list; otherwise the result of pool_proprm_common().  The CPU
 * modification time is refreshed when the property was removed.
 */
int
pool_cpu_proprm(processorid_t cpuid, char *name)
{
        cpu_t *cp;
        int rv;

        ASSERT(pool_lock_held());
        ASSERT(INGLOBALZONE(curproc));

        mutex_enter(&cpu_lock);
        cp = cpu_get(cpuid);
        if (cp == NULL || cpu_is_poweredoff(cp) || cp->cpu_props == NULL)
                rv = EINVAL;
        else
                rv = pool_proprm_common(cp->cpu_props, name, pool_cpu_props);

        if (rv == 0)
                pool_cpu_mod = gethrtime();
        mutex_exit(&cpu_lock);
        return (rv);
}

/*
 * This macro returns load average multiplied by 1000 w/o losing precision.
 *
 * The upper bits of f (f >> 16) are treated as the whole part and the low
 * 16 bits as the fraction; each part is scaled by 1000 separately and the
 * fraction is then divided by 0xffff.  The argument is parenthesized so
 * that expressions such as (a | b) expand correctly; note that it is
 * evaluated twice, so it must not have side effects.
 */
#define PSET_LOAD(f)    \
        ((((f) >> 16) * 1000) + ((((f) & 0xffff) * 1000) / 0xffff))

/*
 * Take a snapshot of the current state of processor sets and CPUs,
 * pack it in the exacct format, and attach it to specified exacct record.
 *
 * For each pset on pool_pset_list a pset group is built holding the pset
 * id, one cpu group per cpu in the pset (cpu id plus packed properties),
 * and the packed pset properties with "pset.size" and "pset.load" added.
 * A caller outside the global zone only gets the pset its zone is bound
 * to.  Always returns 0; the return values of the ea_* and nvlist_*
 * calls are discarded.
 */
int
pool_pset_pack(ea_object_t *eo_system)
{
        ea_object_t *eo_pset, *eo_cpu;
        cpupart_t *cpupart;
        psetid_t mypsetid;
        pool_pset_t *pset;
        nvlist_t *nvl;
        size_t bufsz;
        cpu_t *cpu;
        char *buf;
        int ncpu;

        ASSERT(pool_lock_held());

        mutex_enter(&cpu_lock);
        mypsetid = zone_pset_get(curproc->p_zone);
        for (pset = list_head(&pool_pset_list); pset;
            pset = list_next(&pool_pset_list, pset)) {
                psetid_t psetid = pset->pset_id;

                /* non-global callers see only their own zone's pset */
                if (!INGLOBALZONE(curproc) && mypsetid != psetid)
                        continue;
                cpupart = cpupart_find(psetid);
                ASSERT(cpupart != NULL);
                eo_pset = ea_alloc_group(EXT_GROUP |
                    EXC_LOCAL | EXD_GROUP_PSET);
                (void) ea_attach_item(eo_pset, &psetid, sizeof (id_t),
                    EXC_LOCAL | EXD_PSET_PSETID | EXT_UINT32);
                /*
                 * Pack info for all CPUs in this processor set.
                 */
                ncpu = 0;
                cpu = cpu_list;
                do {
                        /*
                         * "continue" in a do-while jumps to the loop
                         * condition, so the walk still advances to the
                         * next cpu.
                         */
                        if (cpu->cpu_part != cpupart)   /* not our pset */
                                continue;
                        ncpu++;
                        eo_cpu = ea_alloc_group(EXT_GROUP
                            | EXC_LOCAL | EXD_GROUP_CPU);
                        (void) ea_attach_item(eo_cpu, &cpu->cpu_id,
                            sizeof (processorid_t),
                            EXC_LOCAL | EXD_CPU_CPUID | EXT_UINT32);
                        /*
                         * Side effect: a cpu without a property list gets
                         * one here, seeded with an empty "cpu.comment".
                         */
                        if (cpu->cpu_props == NULL) {
                                (void) nvlist_alloc(&cpu->cpu_props,
                                    NV_UNIQUE_NAME, KM_SLEEP);
                                (void) nvlist_add_string(cpu->cpu_props,
                                    "cpu.comment", "");
                        }
                        /*
                         * Work on a copy so the dynamic properties added
                         * below are not stored on the cpu itself.
                         */
                        (void) nvlist_dup(cpu->cpu_props, &nvl, KM_SLEEP);
                        (void) nvlist_add_int64(nvl, "cpu.sys_id", cpu->cpu_id);
                        (void) nvlist_add_string(nvl, "cpu.status",
                            (char *)cpu_get_state_str(cpu->cpu_flags));
                        /* buf and bufsz are filled in by nvlist_pack() */
                        buf = NULL;
                        bufsz = 0;
                        (void) nvlist_pack(nvl, &buf, &bufsz,
                            NV_ENCODE_NATIVE, 0);
                        (void) ea_attach_item(eo_cpu, buf, bufsz,
                            EXC_LOCAL | EXD_CPU_PROP | EXT_RAW);
                        (void) nvlist_free(nvl);
                        kmem_free(buf, bufsz);
                        (void) ea_attach_to_group(eo_pset, eo_cpu);
                } while ((cpu = cpu->cpu_next) != cpu_list);

                /* same copy-then-pack sequence for the pset's properties */
                (void) nvlist_dup(pset->pset_props, &nvl, KM_SLEEP);
                (void) nvlist_add_uint64(nvl, "pset.size", ncpu);
                (void) nvlist_add_uint64(nvl, "pset.load",
                    (uint64_t)PSET_LOAD(cpupart->cp_hp_avenrun[0]));
                buf = NULL;
                bufsz = 0;
                (void) nvlist_pack(nvl, &buf, &bufsz, NV_ENCODE_NATIVE, 0);
                (void) ea_attach_item(eo_pset, buf, bufsz,
                    EXC_LOCAL | EXD_PSET_PROP | EXT_RAW);
                (void) nvlist_free(nvl);
                kmem_free(buf, bufsz);

                (void) ea_attach_to_group(eo_system, eo_pset);
        }
        mutex_exit(&cpu_lock);
        return (0);
}

/*
 * Get dynamic property for processor sets.
 * The only dynamic property currently implemented is "pset.load".
 *
 * Returns ESRCH if psetid does not name a known pset or cpu partition,
 * EINVAL if name is not a supported dynamic property, and otherwise the
 * result of adding the value to nvl.  Reporting a missing pset as ESRCH
 * keeps it distinguishable from an unknown property name and matches
 * pool_cpu_propget() and the other pset lookups in this file.
 */
int
pool_pset_propget(psetid_t psetid, char *name, nvlist_t *nvl)
{
        cpupart_t *cpupart;
        pool_pset_t *pset;
        int ret;

        ASSERT(pool_lock_held());

        mutex_enter(&cpu_lock);
        pset = pool_lookup_pset_by_id(psetid);
        cpupart = cpupart_find(psetid);
        if (cpupart == NULL || pset == NULL) {
                mutex_exit(&cpu_lock);
                return (ESRCH);
        }
        if (strcmp(name, "pset.load") == 0)
                ret = nvlist_add_uint64(nvl, "pset.load",
                    (uint64_t)PSET_LOAD(cpupart->cp_hp_avenrun[0]));
        else
                ret = EINVAL;
        mutex_exit(&cpu_lock);
        return (ret);
}

/*
 * Get dynamic property for CPUs.
 * The only dynamic property currently implemented is "cpu.status".
 *
 * Returns ESRCH if the cpu does not exist, EINVAL for any other property
 * name, and otherwise the result of adding the status string to nvl.
 */
int
pool_cpu_propget(processorid_t cpuid, char *name, nvlist_t *nvl)
{
        cpu_t *cp;
        int rv;

        ASSERT(pool_lock_held());

        mutex_enter(&cpu_lock);
        cp = cpu_get(cpuid);
        if (cp == NULL) {
                rv = ESRCH;
        } else if (strcmp(name, "cpu.status") != 0) {
                rv = EINVAL;
        } else {
                rv = nvlist_add_string(nvl, "cpu.status",
                    (char *)cpu_get_state_str(cp->cpu_flags));
        }
        mutex_exit(&cpu_lock);
        return (rv);
}