root/usr/src/uts/common/syscall/lgrpsys.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 * Copyright 2015 Joyent, Inc.
 */

/*
 * lgroup system calls
 */

#include <sys/types.h>
#include <sys/errno.h>
#include <sys/sunddi.h>
#include <sys/systm.h>
#include <sys/mman.h>
#include <sys/cpupart.h>
#include <sys/lgrp.h>
#include <sys/lgrp_user.h>
#include <sys/promif.h>         /* for prom_printf() */
#include <sys/sysmacros.h>
#include <sys/policy.h>

#include <vm/as.h>


/*
 * Bits reported in each mi_validity word: VALID_ADDR marks the input
 * address itself as valid, and VALID_REQ shifted left by a request's index
 * marks the answer to that request as valid.
 */
#define VALID_ADDR      0x1
#define VALID_REQ       0x2

/*
 * Run through the given number of addresses and requests and return the
 * corresponding memory information for each address.
 *
 * addr_count is the number of input addresses; anything above
 * MAX_MEMINFO_CNT is clamped to that limit.  mip is the user address of a
 * struct meminfo (struct meminfo32 for a non-native data model caller)
 * describing the input and output arrays.
 *
 * Returns 0 on success.  On failure errno is set to EINVAL (addr_count < 1
 * or mi_info_count outside 1..MAX_MEMINFO_REQ), EFAULT (a copyin or copyout
 * failed) or EPERM (a request other than MEMINFO_VLGRP/MEMINFO_VPAGESIZE was
 * made and secpolicy_meminfo() refused it).
 */
static int
meminfo(int addr_count, struct meminfo *mip)
{
        size_t          in_size, out_size, req_size, val_size;
        struct as       *as;
        struct hat      *hat;
        int             i, j, out_idx, info_count;
        lgrp_t          *lgrp;
        pfn_t           pfn;
        ssize_t         pgsz;
        int             *req_array, *val_array;
        uint64_t        *in_array, *out_array;
        uint64_t        addr, paddr;
        uintptr_t       vaddr;
        int             ret = 0;
        struct meminfo minfo;
#if defined(_SYSCALL32_IMPL)
        struct meminfo32 minfo32;
#endif

        /*
         * Make sure that there is at least one address to translate and
         * limit how many virtual addresses the kernel can do per call
         */
        if (addr_count < 1)
                return (set_errno(EINVAL));
        else if (addr_count > MAX_MEMINFO_CNT)
                addr_count = MAX_MEMINFO_CNT;

        if (get_udatamodel() == DATAMODEL_NATIVE) {
                if (copyin(mip, &minfo, sizeof (struct meminfo)))
                        return (set_errno(EFAULT));
        }
#if defined(_SYSCALL32_IMPL)
        else {
                /* widen the 32-bit user structure into the native one */
                bzero(&minfo, sizeof (minfo));
                if (copyin(mip, &minfo32, sizeof (struct meminfo32)))
                        return (set_errno(EFAULT));
                minfo.mi_inaddr = (const uint64_t *)(uintptr_t)
                    minfo32.mi_inaddr;
                minfo.mi_info_req = (const uint_t *)(uintptr_t)
                    minfo32.mi_info_req;
                minfo.mi_info_count = minfo32.mi_info_count;
                minfo.mi_outdata = (uint64_t *)(uintptr_t)
                    minfo32.mi_outdata;
                minfo.mi_validity = (uint_t *)(uintptr_t)
                    minfo32.mi_validity;
        }
#endif
        /*
         * all the input parameters have been copied in:-
         * addr_count - number of input addresses
         * minfo.mi_inaddr - array of input addresses
         * minfo.mi_info_req - array of types of information requested
         * minfo.mi_info_count - no. of pieces of info requested for each addr
         * minfo.mi_outdata - array into which the results are placed
         * minfo.mi_validity -  array containing bitwise result codes; 0th bit
         *                      evaluates validity of corresponding input
         *                      address, 1st bit validity of response to first
         *                      member of info_req, etc.
         */

        /* make sure mi_info_count is within limit */
        info_count = minfo.mi_info_count;
        if (info_count < 1 || info_count > MAX_MEMINFO_REQ)
                return (set_errno(EINVAL));

        /*
         * allocate buffer in_array for the input addresses and copy them in
         */
        in_size = sizeof (uint64_t) * addr_count;
        in_array = kmem_alloc(in_size, KM_SLEEP);
        if (copyin(minfo.mi_inaddr, in_array, in_size)) {
                kmem_free(in_array, in_size);
                return (set_errno(EFAULT));
        }

        /*
         * allocate buffer req_array for the input info_reqs and copy them in
         */
        req_size = sizeof (uint_t) * info_count;
        req_array = kmem_alloc(req_size, KM_SLEEP);
        if (copyin(minfo.mi_info_req, req_array, req_size)) {
                kmem_free(req_array, req_size);
                kmem_free(in_array, in_size);
                return (set_errno(EFAULT));
        }

        /*
         * Validate privs for each req.  Only the lgroup and page size
         * queries are allowed without passing secpolicy_meminfo().
         */
        for (i = 0; i < info_count; i++) {
                switch (req_array[i] & MEMINFO_MASK) {
                case MEMINFO_VLGRP:
                case MEMINFO_VPAGESIZE:
                        break;
                default:
                        if (secpolicy_meminfo(CRED()) != 0) {
                                kmem_free(req_array, req_size);
                                kmem_free(in_array, in_size);
                                return (set_errno(EPERM));
                        }
                        break;
                }
        }

        /*
         * allocate buffer out_array which holds the results and will have
         * to be copied out later.  The whole buffer is copied out, yet many
         * slots are never written below (unmapped addresses, invalid pfns,
         * requests that produce no answer, and everything past addr_count
         * on the physical-address path), so it must be zero-filled to avoid
         * handing stale kernel heap contents to the caller.
         */
        out_size = sizeof (uint64_t) * addr_count * info_count;
        out_array = kmem_zalloc(out_size, KM_SLEEP);

        /*
         * allocate buffer val_array which holds the validity bits and will
         * have to be copied out later; every element is assigned below
         */
        val_size = sizeof (uint_t) * addr_count;
        val_array = kmem_alloc(val_size, KM_SLEEP);

        if ((req_array[0] & MEMINFO_MASK) == MEMINFO_PLGRP) {
                /*
                 * find the corresponding lgroup for each physical address;
                 * one result per address is stored at out_array[i]
                 */
                for (i = 0; i < addr_count; i++) {
                        paddr = in_array[i];
                        pfn = btop(paddr);
                        lgrp = lgrp_pfn_to_lgrp(pfn);
                        if (lgrp) {
                                out_array[i] = lgrp->lgrp_id;
                                val_array[i] = VALID_ADDR | VALID_REQ;
                        } else {
                                out_array[i] = 0;
                                val_array[i] = 0;
                        }
                }
        } else {
                /* get the corresponding memory info for each virtual address */
                as = curproc->p_as;

                AS_LOCK_ENTER(as, RW_READER);
                hat = as->a_hat;
                /* out_idx is the first output slot belonging to address i */
                for (i = out_idx = 0; i < addr_count; i++, out_idx +=
                    info_count) {
                        addr = in_array[i];
                        vaddr = (uintptr_t)(addr & ~PAGEOFFSET);
                        if (!as_segat(as, (caddr_t)vaddr)) {
                                /* no segment covers this address */
                                val_array[i] = 0;
                                continue;
                        }
                        val_array[i] = VALID_ADDR;
                        pfn = hat_getpfnum(hat, (caddr_t)vaddr);
                        if (pfn != PFN_INVALID) {
                                paddr = (uint64_t)((pfn << PAGESHIFT) |
                                    (addr & PAGEOFFSET));
                                for (j = 0; j < info_count; j++) {
                                        switch (req_array[j] & MEMINFO_MASK) {
                                        case MEMINFO_VPHYSICAL:
                                                /*
                                                 * return the physical address
                                                 * corresponding to the input
                                                 * virtual address
                                                 */
                                                out_array[out_idx + j] = paddr;
                                                val_array[i] |= VALID_REQ << j;
                                                break;
                                        case MEMINFO_VLGRP:
                                                /*
                                                 * return the lgroup of physical
                                                 * page corresponding to the
                                                 * input virtual address
                                                 */
                                                lgrp = lgrp_pfn_to_lgrp(pfn);
                                                if (lgrp) {
                                                        out_array[out_idx + j] =
                                                            lgrp->lgrp_id;
                                                        val_array[i] |=
                                                            VALID_REQ << j;
                                                }
                                                break;
                                        case MEMINFO_VPAGESIZE:
                                                /*
                                                 * return the size of physical
                                                 * page corresponding to the
                                                 * input virtual address
                                                 */
                                                pgsz = hat_getpagesize(hat,
                                                    (caddr_t)vaddr);
                                                if (pgsz != -1) {
                                                        out_array[out_idx + j] =
                                                            pgsz;
                                                        val_array[i] |=
                                                            VALID_REQ << j;
                                                }
                                                break;
                                        case MEMINFO_VREPLCNT:
                                                /*
                                                 * for future use:-
                                                 * return the no. replicated
                                                 * physical pages corresponding
                                                 * to the input virtual address,
                                                 * so it is always 0 at the
                                                 * moment
                                                 */
                                                out_array[out_idx + j] = 0;
                                                val_array[i] |= VALID_REQ << j;
                                                break;
                                        case MEMINFO_VREPL:
                                                /*
                                                 * for future use:-
                                                 * return the nth physical
                                                 * replica of the specified
                                                 * virtual address
                                                 */
                                                break;
                                        case MEMINFO_VREPL_LGRP:
                                                /*
                                                 * for future use:-
                                                 * return the lgroup of nth
                                                 * physical replica of the
                                                 * specified virtual address
                                                 */
                                                break;
                                        case MEMINFO_PLGRP:
                                                /*
                                                 * this is for physical address
                                                 * only, shouldn't mix with
                                                 * virtual address
                                                 */
                                                break;
                                        default:
                                                break;
                                        }
                                }
                        }
                }
                AS_LOCK_EXIT(as);
        }

        /* copy out the results and validity bits and free the buffers */
        if ((copyout(out_array, minfo.mi_outdata, out_size) != 0) ||
            (copyout(val_array, minfo.mi_validity, val_size) != 0))
                ret = set_errno(EFAULT);

        kmem_free(in_array, in_size);
        kmem_free(out_array, out_size);
        kmem_free(req_array, req_size);
        kmem_free(val_array, val_size);

        return (ret);
}


/*
 * Start a thread off with no lgroup affinity buffer.  A NULL bufaddr is
 * tolerated and ignored.
 */
void
lgrp_affinity_init(lgrp_affinity_t **bufaddr)
{
        if (bufaddr == NULL)
                return;

        *bufaddr = NULL;
}


/*
 * Release a thread's lgroup affinity buffer (nlgrpsmax entries) and clear
 * the pointer so that a recycled thread does not see a stale buffer.
 * Nothing happens when bufaddr or the buffer it points at is NULL.
 */
void
lgrp_affinity_free(lgrp_affinity_t **bufaddr)
{
        if (bufaddr == NULL || *bufaddr == NULL)
                return;

        kmem_free(*bufaddr, nlgrpsmax * sizeof (lgrp_affinity_t));
        *bufaddr = NULL;
}


/*
 * Cookie specifying any ID.  Parenthesized so that the negative constant
 * cannot combine with neighboring tokens wherever the macro is expanded.
 */
#define P_ANY   (-2)


/*
 * Look up the affinity that one LWP of process "p" has for lgroup "lgrp".
 * "lwpid" selects the LWP by ID; P_ANY accepts the first LWP on the
 * process' thread list.  The caller must hold p->p_lock.
 *
 * Returns the affinity (LGRP_AFF_NONE when the LWP has no affinity buffer).
 * Fails with ESRCH when the process has no threads or no LWP matches, and
 * with EPERM when the matching LWP is in class 0 or hasprocperm() denies
 * the caller.
 */
lgrp_affinity_t
lgrp_affinity_get_thread(proc_t *p, id_t lwpid, lgrp_id_t lgrp)
{
        lgrp_affinity_t aff;
        kthread_t       *t;

        ASSERT(MUTEX_HELD(&p->p_lock));

        /*
         * The process may be executing in proc_exit(), in which case its
         * p->p_tlist may already be NULL.
         */
        t = p->p_tlist;
        if (t == NULL)
                return (set_errno(ESRCH));

        do {
                /* not the LWP being looked for; move on to the next one */
                if (lwpid != P_ANY && t->t_tid != lwpid)
                        continue;

                thread_lock(t);

                /*
                 * Check to see whether caller has permission over this LWP
                 */
                if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
                        thread_unlock(t);
                        return (set_errno(EPERM));
                }

                aff = LGRP_AFF_NONE;
                if (t->t_lgrp_affinity != NULL)
                        aff = t->t_lgrp_affinity[lgrp];
                thread_unlock(t);

                return (aff);
        } while ((t = t->t_forw) != p->p_tlist);

        /* walked the whole list without a match */
        return (set_errno(ESRCH));
}


/*
 * Get lgroup affinity for given LWP or process.
 *
 * "ap" is the user address of an lgrp_affinity_args_t naming the ID type
 * (P_LWPID or P_PID), the ID and the lgroup.  Returns the affinity, or sets
 * errno to EFAULT (copyin failed), EINVAL (negative lgroup, LGRP_NONE or
 * unsupported ID type) or ESRCH (lgroup above lgrp_alloc_max, or process
 * not found); errors from lgrp_affinity_get_thread() are passed through.
 */
lgrp_affinity_t
lgrp_affinity_get(lgrp_affinity_args_t *ap)
{
        lgrp_affinity_args_t    uargs;
        lgrp_affinity_t         result;
        lgrp_id_t               lgrpid;
        proc_t                  *pp;

        if (copyin(ap, &uargs, sizeof (uargs)) != 0)
                return (set_errno(EFAULT));

        lgrpid = uargs.lgrp;

        /* reject lgroup IDs that can never be valid */
        if (lgrpid < 0 || lgrpid == LGRP_NONE)
                return (set_errno(EINVAL));

        /* reject lgroup IDs beyond the highest one allocated */
        if (lgrpid > lgrp_alloc_max)
                return (set_errno(ESRCH));

        if (uargs.idtype == P_LWPID) {
                /*
                 * LWP in current process
                 */
                pp = curproc;
                mutex_enter(&pp->p_lock);
                if (uargs.id == P_MYID) {
                        /* current thread: read its buffer directly */
                        kthread_t *self = curthread;

                        result = LGRP_AFF_NONE;
                        thread_lock(self);
                        if (self->t_lgrp_affinity != NULL)
                                result = self->t_lgrp_affinity[lgrpid];
                        thread_unlock(self);
                } else {
                        /* some other thread in this process */
                        result = lgrp_affinity_get_thread(pp, uargs.id,
                            lgrpid);
                }
                mutex_exit(&pp->p_lock);
                return (result);
        }

        if (uargs.idtype == P_PID) {
                /*
                 * Process: pidlock is held across the lookup and the query
                 */
                mutex_enter(&pidlock);

                pp = (uargs.id == P_MYID) ? curproc : prfind(uargs.id);
                if (pp == NULL) {
                        mutex_exit(&pidlock);
                        return (set_errno(ESRCH));
                }

                mutex_enter(&pp->p_lock);
                result = lgrp_affinity_get_thread(pp, P_ANY, lgrpid);
                mutex_exit(&pp->p_lock);

                mutex_exit(&pidlock);
                return (result);
        }

        /* neither an LWP ID nor a process ID */
        return (set_errno(EINVAL));
}


/*
 * Pick the lpl in the given partition for which thread "t" has the most
 * affinity.  The search begins at the thread's home lgroup unless
 * "prefer_start" asks for "start" to be tried first.
 *
 * Returns NULL when the thread has no affinity buffer or when the best
 * affinity found is LGRP_AFF_NONE.  For a thread bound to a CPU, only that
 * CPU's leaf lpl and its ancestors are considered.
 */
lpl_t *
lgrp_affinity_best(kthread_t *t, struct cpupart *cpupart, lgrp_id_t start,
    boolean_t prefer_start)
{
        lgrp_affinity_t *affs;
        lgrp_affinity_t top_aff;
        lpl_t           *top_lpl;
        lgrp_id_t       first;
        lgrp_id_t       home;
        lgrp_id_t       cur;

        ASSERT(t != NULL);
        ASSERT((MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0) ||
            (MUTEX_HELD(&ttoproc(t)->p_lock) && THREAD_LOCK_HELD(t)));
        ASSERT(cpupart != NULL);

        affs = t->t_lgrp_affinity;
        if (affs == NULL)
                return (NULL);

        /*
         * Thread bound to CPU: walk from the leaf lpl directly containing
         * the CPU up through its ancestors and keep whichever has the most
         * affinity (the leaf wins ties).
         */
        if (t->t_bind_cpu != PBIND_NONE) {
                lpl_t   *walk;

                top_lpl = cpu[t->t_bind_cpu]->cpu_lpl;
                top_aff = affs[top_lpl->lpl_lgrpid];
                for (walk = top_lpl->lpl_parent; walk != NULL;
                    walk = walk->lpl_parent) {
                        if (affs[walk->lpl_lgrpid] > top_aff) {
                                top_lpl = walk;
                                top_aff = affs[walk->lpl_lgrpid];
                        }
                }
                return (top_lpl);
        }

        /*
         * Choose where to begin: the home lgroup unless the given starting
         * lgroup is preferred or home has no CPUs in this pset; failing
         * that the starting lgroup if it has CPUs in this pset; otherwise
         * the root lgroup.
         */
        ASSERT(start >= 0 && start <= lgrp_alloc_max);
        home = t->t_lpl->lpl_lgrpid;
        if (!prefer_start && LGRP_CPUS_IN_PART(home, cpupart))
                cur = home;
        else if (start != LGRP_NONE && LGRP_CPUS_IN_PART(start, cpupart))
                cur = start;
        else
                cur = LGRP_ROOTID;

        first = cur;
        top_lpl = &cpupart->cp_lgrploads[cur];
        top_aff = affs[cur];

        /*
         * Visit every lgroup ID once, wrapping from lgrp_alloc_max back to
         * 0, and remember the one with strictly more affinity.  Lgroups
         * without CPU resources in this processor set are skipped.
         */
        do {
                if (LGRP_CPUS_IN_PART(cur, cpupart) && affs[cur] > top_aff) {
                        top_aff = affs[cur];
                        top_lpl = &cpupart->cp_lgrploads[cur];
                }

                cur = (cur < lgrp_alloc_max) ? cur + 1 : 0;
        } while (cur != first);

        /*
         * No lgroup (in this pset) with any affinity
         */
        if (top_aff == LGRP_AFF_NONE)
                return (NULL);

        ASSERT(LGRP_CPUS_IN_PART(top_lpl->lpl_lgrpid, cpupart) &&
            top_lpl->lpl_ncpu > 0);

        return (top_lpl);
}


/*
 * Record affinity "aff" for lgroup "lgrp" on thread "t" and rehome the
 * thread if that changes which lgroup it favors.
 *
 * The caller holds the owning process' p_lock; the thread lock is taken
 * here.  "aff_buf" supplies a preallocated affinity buffer: if the thread
 * has none and a buffer is needed, *aff_buf is handed to the thread and set
 * to NULL so the caller knows it was consumed.
 *
 * Returns 0 on success, or sets errno to EPERM when the thread is in
 * class 0 or hasprocperm() denies the caller.
 */
int
lgrp_affinity_set_thread(kthread_t *t, lgrp_id_t lgrp, lgrp_affinity_t aff,
    lgrp_affinity_t **aff_buf)
{
        lgrp_affinity_t *affs;
        lpl_t           *target;
        lgrp_id_t       home_id;
        lgrp_id_t       target_id;

        ASSERT(t != NULL);
        ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

        thread_lock(t);

        /*
         * Check to see whether caller has permission to set affinity for
         * thread
         */
        if (t->t_cid == 0 || !hasprocperm(t->t_cred, CRED())) {
                thread_unlock(t);
                return (set_errno(EPERM));
        }

        if (t->t_lgrp_affinity == NULL) {
                /* clearing an affinity that was never set needs no buffer */
                if (aff == LGRP_AFF_NONE) {
                        thread_unlock(t);
                        return (0);
                }

                /* adopt the caller's preallocated buffer */
                ASSERT(aff_buf != NULL && *aff_buf != NULL);
                t->t_lgrp_affinity = *aff_buf;
                *aff_buf = NULL;
        }

        affs = t->t_lgrp_affinity;
        affs[lgrp] = aff;

        /*
         * Find lgroup for which thread has most affinity,
         * starting with lgroup for which affinity being set
         */
        target = lgrp_affinity_best(t, t->t_cpupart, lgrp, B_TRUE);

        /*
         * Rehome if found lgroup with more affinity than home or lgroup for
         * which affinity is being set has same affinity as home
         */
        home_id = t->t_lpl->lpl_lgrpid;
        if (target != NULL && target != t->t_lpl) {
                target_id = target->lpl_lgrpid;
                if (affs[target_id] > affs[home_id] ||
                    (affs[target_id] == affs[home_id] && target_id == lgrp))
                        lgrp_move_thread(t, target, 1);
        }

        thread_unlock(t);

        return (0);
}


/*
 * Set affinity "aff" for lgroup "lgrp" on every thread of process "p".
 *
 * The caller holds pidlock and p->p_lock.  "aff_buf_array" holds
 * preallocated affinity buffers; each time a thread consumes one its slot
 * is set to NULL and the next slot is offered to the following thread.
 *
 * Returns 0 if every thread succeeded (or the process has no threads),
 * otherwise the first non-zero result from lgrp_affinity_set_thread().
 */
int
lgrp_affinity_set_proc(proc_t *p, lgrp_id_t lgrp, lgrp_affinity_t aff,
    lgrp_affinity_t **aff_buf_array)
{
        kthread_t       *t;
        lgrp_affinity_t *spare;
        int             first_err = 0;
        int             next = 0;
        int             rc;

        ASSERT(MUTEX_HELD(&pidlock) && MUTEX_HELD(&p->p_lock));
        ASSERT(aff_buf_array != NULL);

        t = p->p_tlist;
        if (t == NULL)
                return (0);

        do {
                /*
                 * Offer the next unused buffer to this thread
                 */
                spare = aff_buf_array[next];
                rc = lgrp_affinity_set_thread(t, lgrp, aff, &spare);

                /* remember only the first failure */
                if (first_err == 0 && rc != 0)
                        first_err = rc;

                /*
                 * The buffer was taken: clear its slot so the caller does
                 * not free it, and advance to the next buffer
                 */
                if (spare == NULL) {
                        ASSERT(next < p->p_lwpcnt);
                        aff_buf_array[next] = NULL;
                        next++;
                }
        } while ((t = t->t_forw) != p->p_tlist);

        return (first_err);
}


/*
 * Set LWP's or process' affinity for specified lgroup
 *
 * "ap" is the user address of an lgrp_affinity_args_t naming the ID type
 * (P_LWPID or P_PID), the ID, the lgroup and the new affinity.  Returns 0
 * on success, or sets errno to EFAULT (copyin failed), EINVAL (bad lgroup,
 * affinity or ID type) or ESRCH (lgroup above lgrp_alloc_max, or no such
 * LWP/process); errors from lgrp_affinity_set_thread() are passed through.
 *
 * When setting affinities, pidlock, process p_lock, and thread_lock()
 * need to be held in that order to protect target thread's pset, process,
 * process contents, and thread contents.  thread_lock() does splhigh(),
 * so it ends up having similar effect as kpreempt_disable(), so it will
 * protect calls to lgrp_move_thread() and lgrp_choose() from pset changes.
 */
int
lgrp_affinity_set(lgrp_affinity_args_t *ap)
{
        lgrp_affinity_t         aff;
        lgrp_affinity_t         *aff_buf;
        lgrp_affinity_args_t    args;
        id_t                    id;
        idtype_t                idtype;
        lgrp_id_t               lgrp;
        int                     nthreads;
        proc_t                  *p;
        int                     retval;

        /*
         * Copyin arguments
         */
        if (copyin(ap, &args, sizeof (lgrp_affinity_args_t)) != 0)
                return (set_errno(EFAULT));

        idtype = args.idtype;
        id = args.id;
        lgrp = args.lgrp;
        aff = args.aff;

        /*
         * Check for invalid lgroup
         */
        if (lgrp < 0 || lgrp == LGRP_NONE)
                return (set_errno(EINVAL));

        /*
         * Check for existing lgroup
         */
        if (lgrp > lgrp_alloc_max)
                return (set_errno(ESRCH));

        /*
         * Check for legal affinity
         */
        if (aff != LGRP_AFF_NONE && aff != LGRP_AFF_WEAK &&
            aff != LGRP_AFF_STRONG)
                return (set_errno(EINVAL));

        /*
         * Must be process or LWP ID
         */
        if (idtype != P_LWPID && idtype != P_PID)
                return (set_errno(EINVAL));

        retval = EINVAL;
        /*
         * Set given LWP's or process' affinity for specified lgroup
         */
        switch (idtype) {

        case P_LWPID:
                /*
                 * Allocate memory for thread's lgroup affinities
                 * ahead of time w/o holding locks
                 */
                aff_buf = kmem_zalloc(nlgrpsmax * sizeof (lgrp_affinity_t),
                    KM_SLEEP);

                p = curproc;

                /*
                 * Set affinity for thread
                 */
                mutex_enter(&p->p_lock);
                if (id == P_MYID) {             /* current thread */
                        retval = lgrp_affinity_set_thread(curthread, lgrp, aff,
                            &aff_buf);
                } else if (p->p_tlist == NULL) {
                        retval = set_errno(ESRCH);
                } else {                        /* other thread */
                        int             found = 0;
                        kthread_t       *t;

                        t = p->p_tlist;
                        do {
                                if (t->t_tid == id) {
                                        retval = lgrp_affinity_set_thread(t,
                                            lgrp, aff, &aff_buf);
                                        found = 1;
                                        break;
                                }
                        } while ((t = t->t_forw) != p->p_tlist);
                        if (!found)
                                retval = set_errno(ESRCH);
                }
                mutex_exit(&p->p_lock);

                /*
                 * Free memory for lgroup affinities,
                 * since thread didn't need it
                 */
                if (aff_buf)
                        kmem_free(aff_buf,
                            nlgrpsmax * sizeof (lgrp_affinity_t));

                break;

        case P_PID:

                /*
                 * Retry until the buffers preallocated without locks match
                 * the thread count seen with the locks held.  The loop is
                 * left only by "break" (affinities set) or "return"
                 * (process gone); a retry must not consult p->p_lwpcnt
                 * after the locks have been dropped, since p may no longer
                 * be valid and a matching count would end the loop without
                 * any affinity having been set.
                 */
                for (;;) {
                        lgrp_affinity_t **aff_buf_array;
                        int             i;
                        size_t          size;

                        /*
                         * Get process
                         */
                        mutex_enter(&pidlock);

                        if (id == P_MYID)
                                p = curproc;
                        else
                                p = prfind(id);

                        if (p == NULL) {
                                mutex_exit(&pidlock);
                                return (set_errno(ESRCH));
                        }

                        /*
                         * Get number of threads in process
                         *
                         * NOTE: Only care about user processes,
                         *       so p_lwpcnt should be number of threads.
                         */
                        mutex_enter(&p->p_lock);
                        nthreads = p->p_lwpcnt;
                        mutex_exit(&p->p_lock);

                        mutex_exit(&pidlock);

                        if (nthreads < 1)
                                return (set_errno(ESRCH));

                        /*
                         * Preallocate memory for lgroup affinities for
                         * each thread in process now to avoid holding
                         * any locks.  Allocate an array to hold a buffer
                         * for each thread.
                         */
                        aff_buf_array = kmem_zalloc(nthreads *
                            sizeof (lgrp_affinity_t *), KM_SLEEP);

                        size = nlgrpsmax * sizeof (lgrp_affinity_t);
                        for (i = 0; i < nthreads; i++)
                                aff_buf_array[i] = kmem_zalloc(size, KM_SLEEP);

                        mutex_enter(&pidlock);

                        /*
                         * Get process again since dropped locks to allocate
                         * memory (except current process)
                         */
                        if (id != P_MYID)
                                p = prfind(id);

                        /*
                         * Process went away after we dropped locks and before
                         * reacquiring them, so drop locks, free memory, and
                         * return.
                         */
                        if (p == NULL) {
                                mutex_exit(&pidlock);
                                for (i = 0; i < nthreads; i++)
                                        kmem_free(aff_buf_array[i], size);
                                kmem_free(aff_buf_array,
                                    nthreads * sizeof (lgrp_affinity_t *));
                                return (set_errno(ESRCH));
                        }

                        mutex_enter(&p->p_lock);

                        /*
                         * See whether number of threads is same
                         * If not, drop locks, free memory, and try again
                         */
                        if (nthreads != p->p_lwpcnt) {
                                mutex_exit(&p->p_lock);
                                mutex_exit(&pidlock);
                                for (i = 0; i < nthreads; i++)
                                        kmem_free(aff_buf_array[i], size);
                                kmem_free(aff_buf_array,
                                    nthreads * sizeof (lgrp_affinity_t *));
                                continue;
                        }

                        /*
                         * Set lgroup affinity for threads in process
                         */
                        retval = lgrp_affinity_set_proc(p, lgrp, aff,
                            aff_buf_array);

                        mutex_exit(&p->p_lock);
                        mutex_exit(&pidlock);

                        /*
                         * Free any leftover memory, since some threads may
                         * have already allocated memory and set lgroup
                         * affinities before
                         */
                        for (i = 0; i < nthreads; i++)
                                if (aff_buf_array[i] != NULL)
                                        kmem_free(aff_buf_array[i], size);
                        kmem_free(aff_buf_array,
                            nthreads * sizeof (lgrp_affinity_t *));

                        break;
                }

                break;

        default:
                retval = set_errno(EINVAL);
                break;
        }

        return (retval);
}


/*
 * Return the latest generation number for the lgroup hierarchy
 * with the given view
 */
lgrp_gen_t
lgrp_generation(lgrp_view_t view)
{
        uint_t          gen;

        /*
         * Preemption stays disabled while the generation counters are read.
         */
        kpreempt_disable();

        /*
         * Every view starts from the generation number of the lgroup
         * hierarchy; for the OS view that is the whole answer.
         */
        gen = lgrp_gen;

        if (view != LGRP_VIEW_OS) {
                cpupart_t       *part;

                /*
                 * Caller's view also adds in the generation number of the
                 * caller's pset.
                 * NOTE: Caller needs to check for change in pset ID
                 */
                part = curthread->t_cpupart;
                ASSERT(part);
                gen += part->cp_gen;
        }

        kpreempt_enable();

        return (gen);
}


/*
 * Return home lgroup ID of given thread
 *
 * Caller must hold p_lock of the thread's process.  Fails with EPERM (via
 * set_errno()) when the thread's t_cid is 0 or hasprocperm() denies the
 * caller access to the thread.
 */
lgrp_id_t
lgrp_home_thread(kthread_t *t)
{
        lgrp_id_t       home;
        int             allowed;

        ASSERT(t != NULL);
        ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock));

        thread_lock(t);

        /*
         * Check to see whether caller has permission to look at the thread,
         * and only fetch its home lgroup when it does
         */
        allowed = (t->t_cid != 0 && hasprocperm(t->t_cred, CRED()));
        if (allowed)
                home = lgrp_home_id(t);

        thread_unlock(t);

        /*
         * Error is recorded only after thread lock has been dropped
         */
        if (!allowed)
                return (set_errno(EPERM));

        return (home);
}


/*
 * Get home lgroup of given process or thread
 */
lgrp_id_t
lgrp_home_get(idtype_t idtype, id_t id)
{
        proc_t          *p;
        lgrp_id_t       retval;
        kthread_t       *t;

        /*
         * Dispatch on kind of ID given: LWP within current process, or
         * process.  Anything else is EINVAL.
         */
        switch (idtype) {

        case P_LWPID:
                p = curproc;

                /*
                 * Thread list of current process is walked under p_lock
                 */
                mutex_enter(&p->p_lock);
                if (id == P_MYID) {
                        /* current thread */
                        retval = lgrp_home_thread(curthread);
                } else {
                        /* other thread: find LWP with matching ID */
                        kthread_t       *match = NULL;

                        t = p->p_tlist;
                        if (t != NULL) {
                                do {
                                        if (t->t_tid == id) {
                                                match = t;
                                                break;
                                        }
                                        t = t->t_forw;
                                } while (t != p->p_tlist);
                        }

                        if (match != NULL)
                                retval = lgrp_home_thread(match);
                        else
                                retval = set_errno(ESRCH);
                }
                mutex_exit(&p->p_lock);
                break;

        case P_PID:
                /*
                 * Look up process with pidlock held
                 */
                mutex_enter(&pidlock);

                p = (id == P_MYID) ? curproc : prfind(id);
                if (p == NULL) {
                        mutex_exit(&pidlock);
                        return (set_errno(ESRCH));
                }

                /*
                 * Home lgroup of a process is reported as that of the
                 * first thread on its thread list
                 */
                mutex_enter(&p->p_lock);
                t = p->p_tlist;
                retval = (t == NULL) ? set_errno(ESRCH) : lgrp_home_thread(t);
                mutex_exit(&p->p_lock);

                mutex_exit(&pidlock);

                break;

        default:
                retval = set_errno(EINVAL);
                break;
        }

        return (retval);
}


/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful though.
 */
int
lgrp_latency(lgrp_id_t from, lgrp_id_t to)
{
        lgrp_t          *src;
        lgrp_t          *dst;
        int             c;
        int             m;
        int             worst;

        ASSERT(MUTEX_HELD(&cpu_lock));

        /*
         * Negative IDs are invalid; IDs beyond the highest allocated one
         * or naming an lgroup that doesn't exist are not found
         */
        if (from < 0 || to < 0)
                return (set_errno(EINVAL));

        if (from > lgrp_alloc_max || to > lgrp_alloc_max)
                return (set_errno(ESRCH));

        src = lgrp_table[from];
        dst = lgrp_table[to];

        if (!(LGRP_EXISTS(src) && LGRP_EXISTS(dst)))
                return (set_errno(ESRCH));

        /*
         * Same lgroup: use latency recorded in the lgroup itself
         */
        if (from == to)
                return (src->lgrp_latency);

        /*
         * Two leaf lgroups: ask the platform directly
         */
        if (src->lgrp_childcnt == 0 && dst->lgrp_childcnt == 0)
                return (lgrp_plat_latency(src->lgrp_plathand,
                    dst->lgrp_plathand));

        /*
         * Otherwise, take the max platform latency from every lgroup in
         * the CPU resource set of "from" to every lgroup in the memory
         * resource set of "to"
         */
        worst = 0;
        for (c = 0; c <= lgrp_alloc_max; c++) {
                lgrp_t  *cpu_lgrp;

                cpu_lgrp = lgrp_table[c];
                if (!LGRP_EXISTS(cpu_lgrp) ||
                    !klgrpset_ismember(src->lgrp_set[LGRP_RSRC_CPU], c))
                        continue;

                for (m = 0; m <= lgrp_alloc_max; m++) {
                        lgrp_t  *mem_lgrp;
                        int     lat;

                        mem_lgrp = lgrp_table[m];
                        if (LGRP_EXISTS(mem_lgrp) &&
                            klgrpset_ismember(dst->lgrp_set[LGRP_RSRC_MEM],
                            m)) {
                                lat = lgrp_plat_latency(
                                    cpu_lgrp->lgrp_plathand,
                                    mem_lgrp->lgrp_plathand);
                                if (lat > worst)
                                        worst = lat;
                        }
                }
        }
        return (worst);
}


/*
 * Return lgroup interface version number
 * 0 - none
 * 1 - original
 * 2 - lgrp_latency_cookie() and lgrp_resources() added
 */
int
lgrp_version(int version)
{
        int     supported;

        /*
         * Requested version must lie in [LGRP_VER_NONE, LGRP_VER_CURRENT];
         * anything else gets LGRP_VER_NONE back
         */
        supported = (version >= LGRP_VER_NONE && version <= LGRP_VER_CURRENT);
        if (!supported)
                return (LGRP_VER_NONE);

        /*
         * Passing LGRP_VER_NONE asks for the current version; any other
         * supported version is simply echoed back
         */
        return ((version == LGRP_VER_NONE) ? LGRP_VER_CURRENT : version);
}


/*
 * Snapshot of lgroup hierarchy
 *
 * One snapshot is kept and is based on the kernel's native data model, so
 * a 32-bit snapshot is kept for the 32-bit kernel and a 64-bit one for the
 * 64-bit kernel.  If a 32-bit user wants a snapshot from the 64-bit kernel,
 * the kernel generates a 32-bit snapshot from the data in its 64-bit snapshot.
 *
 * The format is defined by lgroup snapshot header and the layout of
 * the snapshot in memory is as follows:
 * 1) lgroup snapshot header
 *    - specifies format of snapshot
 *    - defined by lgrp_snapshot_header_t
 * 2) lgroup info array
 *    - contains information about each lgroup
 *    - one element for each lgroup
 *    - each element is defined by lgrp_info_t
 * 3) lgroup CPU ID array
 *    - contains list (array) of CPU IDs for each lgroup
 *    - lgrp_info_t points into array and specifies how many CPUs belong to
 *      given lgroup
 * 4) lgroup parents array
 *    - contains lgroup bitmask of parents for each lgroup
 *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
 * 5) lgroup children array
 *    - contains lgroup bitmask of children for each lgroup
 *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
 * 6) lgroup resources array
 *    - contains lgroup bitmask of resources for each lgroup
 *    - bitmask is an array of unsigned longs and its size depends on nlgrpsmax
 * 7) lgroup latency table
 *    - contains latency from each lgroup to each of other lgroups
 *
 * NOTE:  Must use nlgrpsmax for per lgroup data structures because lgroups
 *        may be sparsely allocated.
 */
/*
 * lgrp_snap points at the single cached kernel snapshot; it stays NULL until
 * lgrp_snapshot() allocates one, and is freed and rebuilt there when its
 * generation no longer matches lgrp_gen.
 * NOTE(review): lgrp_snap_lock presumably serializes taking and copying out
 * the snapshot; its users are not in this part of the file -- confirm.
 */
lgrp_snapshot_header_t  *lgrp_snap = NULL;      /* lgroup snapshot */
static kmutex_t         lgrp_snap_lock;         /* snapshot lock */


/*
 * Take a snapshot of lgroup hierarchy and return size of buffer
 * needed to hold snapshot
 */
static int
lgrp_snapshot(void)
{
        size_t          bitmask_size;
        size_t          bitmasks_size;
        size_t          bufsize;
        int             cpu_index;
        size_t          cpuids_size;
        int             i;
        int             j;
        size_t          info_size;
        size_t          lats_size;
        ulong_t         *lgrp_children;
        processorid_t   *lgrp_cpuids;
        lgrp_info_t     *lgrp_info;
        int             **lgrp_lats;
        ulong_t         *lgrp_parents;
        ulong_t         *lgrp_rsets;
        ulong_t         *lgrpset;
        int             snap_ncpus;
        int             snap_nlgrps;
        int             snap_nlgrpsmax;
        size_t          snap_hdr_size;
#ifdef  _SYSCALL32_IMPL
        model_t         model = DATAMODEL_NATIVE;

        /*
         * Have up-to-date snapshot, so check to see whether caller is 32-bit
         * program and need to return size of 32-bit snapshot now.
         */
        model = get_udatamodel();
        if (model == DATAMODEL_ILP32 && lgrp_snap &&
            lgrp_snap->ss_gen == lgrp_gen) {

                snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;

                /*
                 * Calculate size of buffer needed for 32-bit snapshot,
                 * rounding up size of each object to allow for alignment
                 * of next object in buffer.
                 */
                snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
                    sizeof (caddr32_t));
                info_size =
                    P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
                    sizeof (processorid_t));
                cpuids_size =
                    P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
                    sizeof (ulong_t));

                /*
                 * lgroup bitmasks needed for parents, children, and resources
                 * for each lgroup and pset lgroup set
                 */
                bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
                bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
                    snap_nlgrpsmax) + 1) * bitmask_size;

                /*
                 * Size of latency table and buffer
                 */
                lats_size = snap_nlgrpsmax * sizeof (caddr32_t) +
                    snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);

                bufsize = snap_hdr_size + info_size + cpuids_size +
                    bitmasks_size + lats_size;
                return (bufsize);
        }
#endif  /* _SYSCALL32_IMPL */

        /*
         * Check whether snapshot is up-to-date
         * Free it and take another one if not
         */
        if (lgrp_snap) {
                if (lgrp_snap->ss_gen == lgrp_gen)
                        return (lgrp_snap->ss_size);

                kmem_free(lgrp_snap, lgrp_snap->ss_size);
                lgrp_snap = NULL;
        }

        /*
         * Allocate memory for snapshot
         * w/o holding cpu_lock while waiting for memory
         */
        while (lgrp_snap == NULL) {
                int     old_generation;

                /*
                 * Take snapshot of lgroup generation number
                 * and configuration size dependent information
                 * NOTE: Only count number of online CPUs,
                 * since only online CPUs appear in lgroups.
                 */
                mutex_enter(&cpu_lock);
                old_generation = lgrp_gen;
                snap_ncpus = ncpus_online;
                snap_nlgrps = nlgrps;
                snap_nlgrpsmax = nlgrpsmax;
                mutex_exit(&cpu_lock);

                /*
                 * Calculate size of buffer needed for snapshot,
                 * rounding up size of each object to allow for alignment
                 * of next object in buffer.
                 */
                snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
                    sizeof (void *));
                info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
                    sizeof (processorid_t));
                cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
                    sizeof (ulong_t));
                /*
                 * lgroup bitmasks needed for pset lgroup set and  parents,
                 * children, and resource sets for each lgroup
                 */
                bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
                bitmasks_size = (((2 + LGRP_RSRC_COUNT) *
                    snap_nlgrpsmax) + 1) * bitmask_size;

                /*
                 * Size of latency table and buffer
                 */
                lats_size = snap_nlgrpsmax * sizeof (int *) +
                    snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int);

                bufsize = snap_hdr_size + info_size + cpuids_size +
                    bitmasks_size + lats_size;

                /*
                 * Allocate memory for buffer
                 */
                lgrp_snap = kmem_zalloc(bufsize, KM_NOSLEEP);
                if (lgrp_snap == NULL)
                        return (set_errno(ENOMEM));

                /*
                 * Check whether generation number has changed
                 */
                mutex_enter(&cpu_lock);
                if (lgrp_gen == old_generation)
                        break;          /* hasn't change, so done. */

                /*
                 * Generation number changed, so free memory and try again.
                 */
                mutex_exit(&cpu_lock);
                kmem_free(lgrp_snap, bufsize);
                lgrp_snap = NULL;
        }

        /*
         * Fill in lgroup snapshot header
         * (including pointers to tables of lgroup info, CPU IDs, and parents
         * and children)
         */
        lgrp_snap->ss_version = LGRP_VER_CURRENT;

        /*
         * XXX For now, liblgrp only needs to know whether the hierarchy
         * XXX only has one level or not
         */
        if (snap_nlgrps == 1)
                lgrp_snap->ss_levels = 1;
        else
                lgrp_snap->ss_levels = 2;

        lgrp_snap->ss_root = LGRP_ROOTID;

        lgrp_snap->ss_nlgrps = lgrp_snap->ss_nlgrps_os = snap_nlgrps;
        lgrp_snap->ss_nlgrps_max = snap_nlgrpsmax;
        lgrp_snap->ss_ncpus = snap_ncpus;
        lgrp_snap->ss_gen = lgrp_gen;
        lgrp_snap->ss_view = LGRP_VIEW_OS;
        lgrp_snap->ss_pset = 0;         /* NOTE: caller should set if needed */
        lgrp_snap->ss_size = bufsize;
        lgrp_snap->ss_magic = (uintptr_t)lgrp_snap;

        lgrp_snap->ss_info = lgrp_info =
            (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);

        lgrp_snap->ss_cpuids = lgrp_cpuids =
            (processorid_t *)((uintptr_t)lgrp_info + info_size);

        lgrp_snap->ss_lgrpset = lgrpset =
            (ulong_t *)((uintptr_t)lgrp_cpuids + cpuids_size);

        lgrp_snap->ss_parents = lgrp_parents =
            (ulong_t *)((uintptr_t)lgrpset + bitmask_size);

        lgrp_snap->ss_children = lgrp_children =
            (ulong_t *)((uintptr_t)lgrp_parents + (snap_nlgrpsmax *
            bitmask_size));

        lgrp_snap->ss_rsets = lgrp_rsets =
            (ulong_t *)((uintptr_t)lgrp_children + (snap_nlgrpsmax *
            bitmask_size));

        lgrp_snap->ss_latencies = lgrp_lats =
            (int **)((uintptr_t)lgrp_rsets + (LGRP_RSRC_COUNT *
            snap_nlgrpsmax * bitmask_size));

        /*
         * Fill in lgroup information
         */
        cpu_index = 0;
        for (i = 0; i < snap_nlgrpsmax; i++) {
                struct cpu      *cp;
                int             cpu_count;
                struct cpu      *head;
                int             k;
                lgrp_t          *lgrp;

                lgrp = lgrp_table[i];
                if (!LGRP_EXISTS(lgrp)) {
                        bzero(&lgrp_info[i], sizeof (lgrp_info[i]));
                        lgrp_info[i].info_lgrpid = LGRP_NONE;
                        continue;
                }

                lgrp_info[i].info_lgrpid = i;
                lgrp_info[i].info_latency = lgrp->lgrp_latency;

                /*
                 * Fill in parents, children, and lgroup resources
                 */
                lgrp_info[i].info_parents =
                    (ulong_t *)((uintptr_t)lgrp_parents + (i * bitmask_size));

                if (lgrp->lgrp_parent)
                        BT_SET(lgrp_info[i].info_parents,
                            lgrp->lgrp_parent->lgrp_id);

                lgrp_info[i].info_children =
                    (ulong_t *)((uintptr_t)lgrp_children + (i * bitmask_size));

                for (j = 0; j < snap_nlgrpsmax; j++)
                        if (klgrpset_ismember(lgrp->lgrp_children, j))
                                BT_SET(lgrp_info[i].info_children, j);

                lgrp_info[i].info_rset =
                    (ulong_t *)((uintptr_t)lgrp_rsets +
                    (i * LGRP_RSRC_COUNT * bitmask_size));

                for (j = 0; j < LGRP_RSRC_COUNT; j++) {
                        ulong_t *rset;

                        rset = (ulong_t *)((uintptr_t)lgrp_info[i].info_rset +
                            (j * bitmask_size));
                        for (k = 0; k < snap_nlgrpsmax; k++)
                                if (klgrpset_ismember(lgrp->lgrp_set[j], k))
                                        BT_SET(rset, k);
                }

                /*
                 * Fill in CPU IDs
                 */
                cpu_count = 0;
                lgrp_info[i].info_cpuids = NULL;
                cp = head = lgrp->lgrp_cpu;
                if (head != NULL) {
                        lgrp_info[i].info_cpuids = &lgrp_cpuids[cpu_index];
                        do {
                                lgrp_cpuids[cpu_index] = cp->cpu_id;
                                cpu_index++;
                                cpu_count++;
                                cp = cp->cpu_next_lgrp;
                        } while (cp != head);
                }
                ASSERT(cpu_count == lgrp->lgrp_cpucnt);
                lgrp_info[i].info_ncpus = cpu_count;

                /*
                 * Fill in memory sizes for lgroups that directly contain
                 * memory
                 */
                if (klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], i)) {
                        lgrp_info[i].info_mem_free =
                            lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);
                        lgrp_info[i].info_mem_install =
                            lgrp_mem_size(i, LGRP_MEM_SIZE_INSTALL);
                }

                /*
                 * Fill in latency table and buffer
                 */
                lgrp_lats[i] = (int *)((uintptr_t)lgrp_lats + snap_nlgrpsmax *
                    sizeof (int *) + i * snap_nlgrpsmax * sizeof (int));
                for (j = 0; j < snap_nlgrpsmax; j++) {
                        lgrp_t  *to;

                        to = lgrp_table[j];
                        if (!LGRP_EXISTS(to))
                                continue;
                        lgrp_lats[i][j] = lgrp_latency(lgrp->lgrp_id,
                            to->lgrp_id);
                }
        }
        ASSERT(cpu_index == snap_ncpus);


        mutex_exit(&cpu_lock);

#ifdef  _SYSCALL32_IMPL
        /*
         * Check to see whether caller is 32-bit program and need to return
         * size of 32-bit snapshot now that snapshot has been taken/updated.
         * May not have been able to do this earlier if snapshot was out of
         * date or didn't exist yet.
         */
        if (model == DATAMODEL_ILP32) {

                snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;

                /*
                 * Calculate size of buffer needed for 32-bit snapshot,
                 * rounding up size of each object to allow for alignment
                 * of next object in buffer.
                 */
                snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
                    sizeof (caddr32_t));
                info_size =
                    P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
                    sizeof (processorid_t));
                cpuids_size =
                    P2ROUNDUP(lgrp_snap->ss_ncpus * sizeof (processorid_t),
                    sizeof (ulong_t));

                bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);
                bitmasks_size = (((2 + LGRP_RSRC_COUNT) * snap_nlgrpsmax) +
                    1) * bitmask_size;


                /*
                 * Size of latency table and buffer
                 */
                lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) +
                    (snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int));

                bufsize = snap_hdr_size + info_size + cpuids_size +
                    bitmasks_size + lats_size;
                return (bufsize);
        }
#endif  /* _SYSCALL32_IMPL */

        return (lgrp_snap->ss_size);
}


/*
 * Copy snapshot into given user buffer, fix up any pointers in buffer to point
 * into user instead of kernel address space, and return size of buffer
 * needed to hold snapshot
 */
static int
lgrp_snapshot_copy(char *buf, size_t bufsize)
{
        size_t                  bitmask_size;
        int                     cpu_index;
        size_t                  cpuids_size;
        int                     i;
        size_t                  info_size;
        lgrp_info_t             *lgrp_info;
        int                     retval;
        size_t                  snap_hdr_size;
        int                     snap_ncpus;
        int                     snap_nlgrpsmax;
        lgrp_snapshot_header_t  *user_snap;
        lgrp_info_t             *user_info;
        lgrp_info_t             *user_info_buffer;
        processorid_t           *user_cpuids;
        ulong_t                 *user_lgrpset;
        ulong_t                 *user_parents;
        ulong_t                 *user_children;
        int                     **user_lats;
        int                     **user_lats_buffer;
        ulong_t                 *user_rsets;

        /*
         * No snapshot has been taken, so there is nothing to copy
         */
        if (lgrp_snap == NULL)
                return (0);

        /*
         * No buffer given, so just report size needed
         */
        if (buf == NULL || bufsize == 0)
                return (lgrp_snap->ss_size);

        /*
         * User needs to try getting size of buffer again
         * because given buffer size is too small.
         * The lgroup hierarchy may have changed after they asked for the size
         * but before the snapshot was taken.
         */
        if (bufsize < lgrp_snap->ss_size)
                return (set_errno(EAGAIN));

        snap_ncpus = lgrp_snap->ss_ncpus;
        snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;

        bitmask_size = BT_SIZEOFMAP(snap_nlgrpsmax);

        /*
         * Fill in lgrpset now because caller may have changed psets
         *
         * The snapshot is cached and reused for as long as its generation
         * matches lgrp_gen, so the bitmask may still hold bits set for an
         * earlier caller.  Clear it first so that only lgroups in this
         * caller's pset are reported.
         */
        bzero(lgrp_snap->ss_lgrpset, bitmask_size);
        kpreempt_disable();
        for (i = 0; i < snap_nlgrpsmax; i++) {
                if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
                    i)) {
                        BT_SET(lgrp_snap->ss_lgrpset, i);
                }
        }
        kpreempt_enable();

        /*
         * Copy lgroup snapshot (snapshot header, lgroup info, and CPU IDs)
         * into user buffer all at once
         * NOTE: Pointers in this copy still point into the kernel snapshot;
         * they are overwritten with user addresses below.
         */
        if (copyout(lgrp_snap, buf, lgrp_snap->ss_size) != 0)
                return (set_errno(EFAULT));

        /*
         * Round up sizes of lgroup snapshot header and info for alignment
         */
        snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header_t),
            sizeof (void *));
        info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info_t),
            sizeof (processorid_t));
        cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
            sizeof (ulong_t));

        /*
         * Calculate pointers into user buffer for lgroup snapshot header,
         * info, and CPU IDs
         * (same layout as the kernel snapshot, based at "buf")
         */
        user_snap = (lgrp_snapshot_header_t *)buf;
        user_info = (lgrp_info_t *)((uintptr_t)user_snap + snap_hdr_size);
        user_cpuids = (processorid_t *)((uintptr_t)user_info + info_size);
        user_lgrpset = (ulong_t *)((uintptr_t)user_cpuids + cpuids_size);
        user_parents = (ulong_t *)((uintptr_t)user_lgrpset + bitmask_size);
        user_children = (ulong_t *)((uintptr_t)user_parents +
            (snap_nlgrpsmax * bitmask_size));
        user_rsets = (ulong_t *)((uintptr_t)user_children +
            (snap_nlgrpsmax * bitmask_size));
        user_lats = (int **)((uintptr_t)user_rsets +
            (LGRP_RSRC_COUNT * snap_nlgrpsmax * bitmask_size));

        /*
         * Copyout magic number (ie. pointer to beginning of buffer)
         */
        if (copyout(&buf, &user_snap->ss_magic, sizeof (buf)) != 0)
                return (set_errno(EFAULT));

        /*
         * Fix up pointers in user buffer to point into user buffer
         * not kernel snapshot
         */
        if (copyout(&user_info, &user_snap->ss_info, sizeof (user_info)) != 0)
                return (set_errno(EFAULT));

        if (copyout(&user_cpuids, &user_snap->ss_cpuids,
            sizeof (user_cpuids)) != 0)
                return (set_errno(EFAULT));

        if (copyout(&user_lgrpset, &user_snap->ss_lgrpset,
            sizeof (user_lgrpset)) != 0)
                return (set_errno(EFAULT));

        if (copyout(&user_parents, &user_snap->ss_parents,
            sizeof (user_parents)) != 0)
                return (set_errno(EFAULT));

        if (copyout(&user_children, &user_snap->ss_children,
            sizeof (user_children)) != 0)
                return (set_errno(EFAULT));

        if (copyout(&user_rsets, &user_snap->ss_rsets,
            sizeof (user_rsets)) != 0)
                return (set_errno(EFAULT));

        if (copyout(&user_lats, &user_snap->ss_latencies,
            sizeof (user_lats)) != 0)
                return (set_errno(EFAULT));

        /*
         * Make copies of lgroup info and latency table, fix up pointers,
         * and then copy them into user buffer
         */
        user_info_buffer = kmem_zalloc(info_size, KM_NOSLEEP);
        if (user_info_buffer == NULL)
                return (set_errno(ENOMEM));

        user_lats_buffer = kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
            KM_NOSLEEP);
        if (user_lats_buffer == NULL) {
                kmem_free(user_info_buffer, info_size);
                return (set_errno(ENOMEM));
        }

        lgrp_info = (lgrp_info_t *)((uintptr_t)lgrp_snap + snap_hdr_size);
        bcopy(lgrp_info, user_info_buffer, info_size);

        cpu_index = 0;
        for (i = 0; i < snap_nlgrpsmax; i++) {
                ulong_t *snap_rset;

                /*
                 * Skip non-existent lgroups
                 */
                if (user_info_buffer[i].info_lgrpid == LGRP_NONE)
                        continue;

                /*
                 * Update free memory size since it changes frequently
                 * Only do so for lgroups directly containing memory
                 *
                 * NOTE: This must be done before changing the pointers to
                 *       point into user space since we need to dereference
                 *       lgroup resource set
                 */
                snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
                    BT_BITOUL(snap_nlgrpsmax)];
                if (BT_TEST(snap_rset, i))
                        user_info_buffer[i].info_mem_free =
                            lgrp_mem_size(i, LGRP_MEM_SIZE_FREE);

                /*
                 * Fix up pointers to parents, children, resources, and
                 * latencies
                 */
                user_info_buffer[i].info_parents =
                    (ulong_t *)((uintptr_t)user_parents + (i * bitmask_size));
                user_info_buffer[i].info_children =
                    (ulong_t *)((uintptr_t)user_children + (i * bitmask_size));
                user_info_buffer[i].info_rset =
                    (ulong_t *)((uintptr_t)user_rsets +
                    (i * LGRP_RSRC_COUNT * bitmask_size));
                user_lats_buffer[i] = (int *)((uintptr_t)user_lats +
                    (snap_nlgrpsmax * sizeof (int *)) + (i * snap_nlgrpsmax *
                    sizeof (int)));

                /*
                 * Fix up pointer to CPU IDs
                 * (CPU IDs of successive lgroups are packed one after
                 * another, so cpu_index tracks where the next lgroup's
                 * list starts)
                 */
                if (user_info_buffer[i].info_ncpus == 0) {
                        user_info_buffer[i].info_cpuids = NULL;
                        continue;
                }
                user_info_buffer[i].info_cpuids = &user_cpuids[cpu_index];
                cpu_index += user_info_buffer[i].info_ncpus;
        }
        ASSERT(cpu_index == snap_ncpus);

        /*
         * Copy lgroup info and latency table with pointers fixed up to point
         * into user buffer out to user buffer now
         * NOTE: Both kernel buffers are freed whether or not the copyouts
         * succeed.
         */
        retval = lgrp_snap->ss_size;
        if (copyout(user_info_buffer, user_info, info_size) != 0)
                retval = set_errno(EFAULT);
        kmem_free(user_info_buffer, info_size);

        if (copyout(user_lats_buffer, user_lats, snap_nlgrpsmax *
            sizeof (int *)) != 0)
                retval = set_errno(EFAULT);
        kmem_free(user_lats_buffer, snap_nlgrpsmax * sizeof (int *));

        return (retval);
}


#ifdef  _SYSCALL32_IMPL
/*
 * Make 32-bit copy of snapshot, fix up any pointers in buffer to point
 * into user instead of kernel address space, copy 32-bit snapshot into
 * given user buffer, and return size of buffer needed to hold snapshot.
 *
 *   buf	32-bit user address of the destination buffer
 *   bufsize	size of that buffer in bytes
 *
 * The 32-bit snapshot is laid out as one contiguous buffer, in this order:
 * header, lgrp_info32_t array, CPU IDs, lgroup set bitmask, parent
 * bitmasks, children bitmasks, resource set bitmasks, latency row pointer
 * table, latency rows.
 *
 * Return values:
 *   0				there is no kernel snapshot (lgrp_snap is NULL)
 *   size of 32-bit snapshot	buf or bufsize is 0 (nothing is copied out),
 *				or the snapshot was copied out successfully
 *   set_errno(EAGAIN)		bufsize is smaller than the 32-bit snapshot
 *   set_errno(ENOMEM)		a temporary kernel buffer could not be
 *				allocated (allocations are KM_NOSLEEP)
 *   set_errno(EFAULT)		copyout() to the user buffer failed
 *
 * lgrpsys() calls this with lgrp_snap_lock held.
 */
static int
lgrp_snapshot_copy32(caddr32_t buf, size32_t bufsize)
{
        size32_t                        bitmask_size;
        size32_t                        bitmasks_size;
        size32_t                        children_size;
        int                             cpu_index;
        size32_t                        cpuids_size;
        int                             i;
        int                             j;
        size32_t                        info_size;
        size32_t                        lats_size;
        lgrp_info_t                     *lgrp_info;
        lgrp_snapshot_header32_t        *lgrp_snap32;
        lgrp_info32_t                   *lgrp_info32;
        processorid_t                   *lgrp_cpuids32;
        caddr32_t                       *lgrp_lats32;
        int                             **lgrp_lats32_kernel;
        uint_t                          *lgrp_set32;
        uint_t                          *lgrp_parents32;
        uint_t                          *lgrp_children32;
        uint_t                          *lgrp_rsets32;
        size32_t                        parents_size;
        int                             retval;
        size32_t                        rsets_size;
        size32_t                        set_size;
        size_t                          snap_bitmask_words;
        size32_t                        snap_hdr_size;
        int                             snap_ncpus;
        int                             snap_nlgrpsmax;
        size32_t                        snap_size;

        if (lgrp_snap == NULL)
                return (0);

        snap_ncpus = lgrp_snap->ss_ncpus;
        snap_nlgrpsmax = lgrp_snap->ss_nlgrps_max;

        /*
         * Number of ulong_t words that one lgroup bitmask occupies in the
         * kernel (native) snapshot.  Used to step from one bitmask to the
         * next when reading the kernel snapshot.
         */
        snap_bitmask_words = BT_BITOUL(snap_nlgrpsmax);

        /*
         * Calculate size of buffer needed for 32-bit snapshot,
         * rounding up size of each object to allow for alignment
         * of next object in buffer.
         */
        snap_hdr_size = P2ROUNDUP(sizeof (lgrp_snapshot_header32_t),
            sizeof (caddr32_t));
        info_size = P2ROUNDUP(snap_nlgrpsmax * sizeof (lgrp_info32_t),
            sizeof (processorid_t));
        cpuids_size = P2ROUNDUP(snap_ncpus * sizeof (processorid_t),
            sizeof (ulong_t));

        /*
         * Size of one bitmask in the 32-bit snapshot
         */
        bitmask_size = BT_SIZEOFMAP32(snap_nlgrpsmax);

        set_size = bitmask_size;
        parents_size = snap_nlgrpsmax * bitmask_size;
        children_size = snap_nlgrpsmax * bitmask_size;
        rsets_size = P2ROUNDUP(LGRP_RSRC_COUNT * snap_nlgrpsmax *
            (int)bitmask_size, sizeof (caddr32_t));

        bitmasks_size = set_size + parents_size + children_size + rsets_size;

        /*
         * Size of latency table and buffer
         */
        lats_size = (snap_nlgrpsmax * sizeof (caddr32_t)) +
            (snap_nlgrpsmax * snap_nlgrpsmax * sizeof (int));

        snap_size = snap_hdr_size + info_size + cpuids_size + bitmasks_size +
            lats_size;

        /*
         * No buffer given, so just report how big it needs to be
         */
        if (buf == 0 || bufsize <= 0) {
                return (snap_size);
        }

        /*
         * User needs to try getting size of buffer again
         * because given buffer size is too small.
         * The lgroup hierarchy may have changed after they asked for the size
         * but before the snapshot was taken.
         */
        if (bufsize < snap_size)
                return (set_errno(EAGAIN));

        /*
         * Make 32-bit copy of snapshot, fix up pointers to point into user
         * buffer not kernel, and then copy whole thing into user buffer
         */
        lgrp_snap32 = kmem_zalloc(snap_size, KM_NOSLEEP);
        if (lgrp_snap32 == NULL)
                return (set_errno(ENOMEM));

        /*
         * Calculate pointers into 32-bit copy of snapshot
         * for lgroup info, CPU IDs, pset lgroup bitmask, parents, children,
         * resources, and latency table and buffer
         */
        lgrp_info32 = (lgrp_info32_t *)((uintptr_t)lgrp_snap32 +
            snap_hdr_size);
        lgrp_cpuids32 = (processorid_t *)((uintptr_t)lgrp_info32 + info_size);
        lgrp_set32 = (uint_t *)((uintptr_t)lgrp_cpuids32 + cpuids_size);
        lgrp_parents32 = (uint_t *)((uintptr_t)lgrp_set32 + set_size);
        lgrp_children32 = (uint_t *)((uintptr_t)lgrp_parents32 + parents_size);
        lgrp_rsets32 = (uint_t *)((uintptr_t)lgrp_children32 + children_size);
        lgrp_lats32 = (caddr32_t *)((uintptr_t)lgrp_rsets32 + rsets_size);

        /*
         * Make temporary lgroup latency table of pointers for kernel to use
         * to fill in rows of table with latencies from each lgroup.
         * (The table inside the 32-bit snapshot holds 32-bit user addresses,
         * so the kernel cannot dereference its entries.)
         */
        lgrp_lats32_kernel = kmem_zalloc(snap_nlgrpsmax * sizeof (int *),
            KM_NOSLEEP);
        if (lgrp_lats32_kernel == NULL) {
                kmem_free(lgrp_snap32, snap_size);
                return (set_errno(ENOMEM));
        }

        /*
         * Fill in 32-bit lgroup snapshot header
         * (with pointers into user's buffer for lgroup info, CPU IDs,
         * bit masks, and latencies).
         *
         * The user-side offsets are built from the very same object sizes
         * that were used above to carve up the kernel copy, so the two
         * layouts cannot drift apart.
         */
        lgrp_snap32->ss_version = lgrp_snap->ss_version;
        lgrp_snap32->ss_levels = lgrp_snap->ss_levels;
        lgrp_snap32->ss_nlgrps = lgrp_snap32->ss_nlgrps_os =
            lgrp_snap->ss_nlgrps;
        lgrp_snap32->ss_nlgrps_max = snap_nlgrpsmax;
        lgrp_snap32->ss_root = lgrp_snap->ss_root;
        lgrp_snap32->ss_ncpus = lgrp_snap->ss_ncpus;
        lgrp_snap32->ss_gen = lgrp_snap->ss_gen;
        lgrp_snap32->ss_view = LGRP_VIEW_OS;
        lgrp_snap32->ss_size = snap_size;
        lgrp_snap32->ss_magic = buf;
        lgrp_snap32->ss_info = buf + snap_hdr_size;
        lgrp_snap32->ss_cpuids = lgrp_snap32->ss_info + info_size;
        lgrp_snap32->ss_lgrpset = lgrp_snap32->ss_cpuids + cpuids_size;
        lgrp_snap32->ss_parents = lgrp_snap32->ss_lgrpset + set_size;
        lgrp_snap32->ss_children = lgrp_snap32->ss_parents + parents_size;
        lgrp_snap32->ss_rsets = lgrp_snap32->ss_children + children_size;
        lgrp_snap32->ss_latencies = lgrp_snap32->ss_rsets + rsets_size;

        /*
         * Fill in lgrpset now because caller may have changed psets
         */
        kpreempt_disable();
        for (i = 0; i < snap_nlgrpsmax; i++) {
                if (klgrpset_ismember(curthread->t_cpupart->cp_lgrpset,
                    i)) {
                        BT_SET32(lgrp_set32, i);
                }
        }
        kpreempt_enable();

        /*
         * Fill in 32-bit copy of lgroup info and fix up pointers
         * to point into user's buffer instead of kernel's
         */
        cpu_index = 0;
        lgrp_info = lgrp_snap->ss_info;
        for (i = 0; i < snap_nlgrpsmax; i++) {
                uint_t  *children;
                uint_t  *lgrp_rset;
                uint_t  *parents;
                ulong_t *snap_children;
                ulong_t *snap_parents;
                ulong_t *snap_rset;

                /*
                 * Skip non-existent lgroups
                 */
                if (lgrp_info[i].info_lgrpid == LGRP_NONE) {
                        bzero(&lgrp_info32[i], sizeof (lgrp_info32[i]));
                        lgrp_info32[i].info_lgrpid = LGRP_NONE;
                        continue;
                }

                /*
                 * Locate this lgroup's bitmasks in the 32-bit copy
                 * (byte offsets in units of bitmask_size) ...
                 */
                parents = (uint_t *)((uintptr_t)lgrp_parents32 +
                    i * bitmask_size);
                children = (uint_t *)((uintptr_t)lgrp_children32 +
                    i * bitmask_size);
                lgrp_rset = (uint_t *)((uintptr_t)lgrp_rsets32 +
                    (i * LGRP_RSRC_COUNT * bitmask_size));

                /*
                 * ... and in the kernel snapshot.  Each kernel bitmask is
                 * snap_bitmask_words ulong_t words long, so consecutive
                 * masks are that many words apart, not one word apart.
                 */
                snap_parents =
                    &lgrp_snap->ss_parents[i * snap_bitmask_words];
                snap_children =
                    &lgrp_snap->ss_children[i * snap_bitmask_words];
                snap_rset = (ulong_t *)((uintptr_t)lgrp_snap->ss_rsets +
                    (i * LGRP_RSRC_COUNT * BT_SIZEOFMAP(snap_nlgrpsmax)));

                /*
                 * Kernel-usable pointer to row i of the latency table,
                 * which follows the table of row pointers
                 */
                lgrp_lats32_kernel[i] = (int *)((uintptr_t)lgrp_lats32 +
                    snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
                    sizeof (int));

                /*
                 * Fill in parents, children, lgroup resource set, and
                 * latencies from snapshot
                 */
                for (j = 0; j < snap_nlgrpsmax; j++) {
                        int     k;
                        uint_t  *rset;

                        if (BT_TEST(snap_parents, j))
                                BT_SET32(parents, j);

                        if (BT_TEST(snap_children, j))
                                BT_SET32(children, j);

                        for (k = 0; k < LGRP_RSRC_COUNT; k++) {
                                rset = (uint_t *)((uintptr_t)lgrp_rset +
                                    k * bitmask_size);
                                if (BT_TEST(&snap_rset[k *
                                    snap_bitmask_words], j))
                                        BT_SET32(rset, j);
                        }

                        lgrp_lats32_kernel[i][j] =
                            lgrp_snap->ss_latencies[i][j];
                }

                /*
                 * Fix up pointer to latency buffer
                 */
                lgrp_lats32[i] = lgrp_snap32->ss_latencies +
                    snap_nlgrpsmax * sizeof (caddr32_t) + i * snap_nlgrpsmax *
                    sizeof (int);

                /*
                 * Fix up pointers for parents, children, and resources
                 */
                lgrp_info32[i].info_parents = lgrp_snap32->ss_parents +
                    (i * bitmask_size);
                lgrp_info32[i].info_children = lgrp_snap32->ss_children +
                    (i * bitmask_size);
                lgrp_info32[i].info_rset = lgrp_snap32->ss_rsets +
                    (i * LGRP_RSRC_COUNT * bitmask_size);

                /*
                 * Fill in memory and CPU info
                 * Only fill in memory for lgroups directly containing memory
                 * (i.e. lgroup i is a member of its own memory resource set)
                 */
                snap_rset = &lgrp_info[i].info_rset[LGRP_RSRC_MEM *
                    snap_bitmask_words];
                if (BT_TEST(snap_rset, i)) {
                        lgrp_info32[i].info_mem_free = lgrp_mem_size(i,
                            LGRP_MEM_SIZE_FREE);
                        lgrp_info32[i].info_mem_install =
                            lgrp_info[i].info_mem_install;
                }

                lgrp_info32[i].info_ncpus = lgrp_info[i].info_ncpus;

                lgrp_info32[i].info_lgrpid = lgrp_info[i].info_lgrpid;
                lgrp_info32[i].info_latency = lgrp_info[i].info_latency;

                if (lgrp_info32[i].info_ncpus == 0) {
                        lgrp_info32[i].info_cpuids = 0;
                        continue;
                }

                /*
                 * Fix up pointer for CPU IDs.  Each lgroup with CPUs gets
                 * the next info_ncpus entries of the shared CPU ID array.
                 */
                lgrp_info32[i].info_cpuids = lgrp_snap32->ss_cpuids +
                    (cpu_index * sizeof (processorid_t));
                cpu_index += lgrp_info32[i].info_ncpus;
        }
        ASSERT(cpu_index == snap_ncpus);

        /*
         * Copy lgroup CPU IDs into 32-bit snapshot
         * before copying it out into user's buffer
         */
        bcopy(lgrp_snap->ss_cpuids, lgrp_cpuids32, cpuids_size);

        /*
         * Copy 32-bit lgroup snapshot into user's buffer all at once,
         * then release both temporary buffers whether or not that worked
         */
        retval = snap_size;
        if (copyout(lgrp_snap32, (void *)(uintptr_t)buf, snap_size) != 0)
                retval = set_errno(EFAULT);

        kmem_free(lgrp_lats32_kernel, snap_nlgrpsmax * sizeof (int *));
        kmem_free(lgrp_snap32, snap_size);

        return (retval);
}
#endif  /* _SYSCALL32_IMPL */


int
lgrpsys(int subcode, long ia, void *ap)
{
        size_t  bufsize;
        int     latency;

        switch (subcode) {

        case LGRP_SYS_AFFINITY_GET:
                return (lgrp_affinity_get((lgrp_affinity_args_t *)ap));

        case LGRP_SYS_AFFINITY_SET:
                return (lgrp_affinity_set((lgrp_affinity_args_t *)ap));

        case LGRP_SYS_GENERATION:
                return (lgrp_generation(ia));

        case LGRP_SYS_HOME:
                return (lgrp_home_get((idtype_t)ia, (id_t)(uintptr_t)ap));

        case LGRP_SYS_LATENCY:
                mutex_enter(&cpu_lock);
                latency = lgrp_latency(ia, (lgrp_id_t)(uintptr_t)ap);
                mutex_exit(&cpu_lock);
                return (latency);

        case LGRP_SYS_MEMINFO:
                return (meminfo(ia, (struct meminfo *)ap));

        case LGRP_SYS_VERSION:
                return (lgrp_version(ia));

        case LGRP_SYS_SNAPSHOT:
                mutex_enter(&lgrp_snap_lock);
                bufsize = lgrp_snapshot();
                if (ap && ia > 0) {
                        if (get_udatamodel() == DATAMODEL_NATIVE)
                                bufsize = lgrp_snapshot_copy(ap, ia);
#ifdef  _SYSCALL32_IMPL
                        else
                                bufsize = lgrp_snapshot_copy32(
                                    (caddr32_t)(uintptr_t)ap, ia);
#endif  /* _SYSCALL32_IMPL */
                }
                mutex_exit(&lgrp_snap_lock);
                return (bufsize);

        default:
                break;

        }

        return (set_errno(EINVAL));
}