root/usr/src/uts/common/disp/cmt_policy.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/systm.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/thread.h>
#include <sys/cpuvar.h>
#include <sys/cpupart.h>
#include <sys/cmn_err.h>
#include <sys/disp.h>
#include <sys/group.h>
#include <sys/bitset.h>
#include <sys/lgrp.h>
#include <sys/cmt.h>

/*
 * CMT dispatcher policies
 *
 * This file implements CMT dispatching policies using Processor Groups.
 *
 * The scheduler/dispatcher leverages knowledge of the performance
 * relevant CMT sharing relationships existing between CPUs to implement
 * load balancing, and coalescence thread placement policies.
 *
 * Load balancing policy seeks to improve performance by minimizing
 * contention over shared processor resources / facilities. Coalescence
 * policies improve resource utilization and ultimately power efficiency.
 *
 * On NUMA systems, the dispatcher will generally perform load balancing and
 * coalescence within (and not across) lgroups. This is because there isn't
 * much sense in trying to correct an imbalance by sending a thread outside
 * of its home, if it would attempt to return home a short while later.
 * The dispatcher will implement CMT policy across lgroups however, if
 * it can do so with a thread homed to the root lgroup, since root homed
 * threads have no lgroup affinity.
 */

/*
 * Return non-zero if, given the policy, we should migrate from running
 * somewhere "here" to somewhere "there".
 *
 * here:   PG the thread would otherwise run in
 * there:  candidate PG to migrate into
 * policy: CMT policy to apply; CMT_BALANCE and CMT_COALESCE conflict
 *         and must not both be set
 * self:   non-zero when the thread under consideration is curthread,
 *         in which case its own contribution to here's utilization
 *         is discounted
 *
 * Returns 1 to migrate, 0 to stay put. A policy with neither
 * CMT_BALANCE nor CMT_COALESCE set never requests a migration.
 */
static int
cmt_should_migrate(pg_cmt_t *here, pg_cmt_t *there, pg_cmt_policy_t policy,
    int self)
{
        uint32_t here_util, there_util;

        here_util = here->cmt_utilization;
        there_util = there->cmt_utilization;

        /*
         * This assumes that curthread's utilization is "1"
         *
         * here_util is unsigned, so only discount curthread when the
         * sampled utilization is non-zero. Decrementing zero would wrap
         * to UINT32_MAX and make "here" look saturated, which would
         * force a migration under CMT_BALANCE.
         */
        if (self && here_util > 0 &&
            bitset_in_set(&here->cmt_cpus_actv_set, CPU->cpu_seqid))
                here_util--;    /* Ignore curthread's effect */

        /*
         * Load balancing and coalescence are conflicting policies
         */
        ASSERT((policy & (CMT_BALANCE|CMT_COALESCE)) !=
            (CMT_BALANCE|CMT_COALESCE));

        if (policy & CMT_BALANCE) {
                /*
                 * Balance utilization
                 *
                 * If the target is comparatively underutilized
                 * (either in an absolute sense, or scaled by capacity),
                 * then choose to balance.
                 */
                if ((here_util > there_util) ||
                    (here_util == there_util &&
                    (CMT_CAPACITY(there) > CMT_CAPACITY(here)))) {
                        return (1);
                }
        } else if (policy & CMT_COALESCE) {
                /*
                 * Attempt to drive group utilization up to capacity
                 *
                 * Only move toward a group that is busier than ours
                 * and still has headroom below its capacity.
                 */
                if (there_util > here_util &&
                    there_util < CMT_CAPACITY(there))
                        return (1);
        }
        return (0);
}

/*
 * Perform multi-level CMT load balancing of running threads.
 *
 * tp is the thread being enqueued.
 * cp is a hint CPU, against which CMT load balancing will be performed.
 *
 * The caller must hold tp's thread lock.
 *
 * Starting at level 0 of cp's CMT lineage, a candidate sibling PG is
 * picked at each level (starting from a pseudo-random index) and
 * cmt_should_migrate() is consulted using the current PG's policy.
 * The search stops at the first level that asks for a migration, or
 * when the lineage is exhausted. If a target PG was chosen, its active
 * CPUs are scanned for an idle CPU in the thread's partition.
 *
 * Returns cp, or a CPU better than cp with respect to balancing
 * running thread load. cp is returned unchanged when cp has no CMT PGs,
 * when no level calls for a migration, or when the chosen target has no
 * suitable idle CPU.
 */
cpu_t *
cmt_balance(kthread_t *tp, cpu_t *cp)
{
        int             hint, i, nsiblings;
        int             self = 0;
        group_t         *cmt_pgs, *siblings;
        pg_cmt_t        *pg, *pg_tmp, *tpg = NULL;
        int             level = 0;
        cpu_t           *newcp;
        extern cmt_lgrp_t *cmt_root;

        ASSERT(THREAD_LOCK_HELD(tp));

        cmt_pgs = &cp->cpu_pg->cmt_pgs;

        if (GROUP_SIZE(cmt_pgs) == 0)
                return (cp);    /* nothing to do */

        if (tp == curthread)
                self = 1;

        /*
         * Balance across siblings in the CPUs CMT lineage
         * If the thread is homed to the root lgroup, perform
         * top level balancing against other top level PGs
         * in the system. Otherwise, start with the default
         * top level siblings group, which is within the leaf lgroup
         */
        pg = GROUP_ACCESS(cmt_pgs, level);
        if (tp->t_lpl->lpl_lgrpid == LGRP_ROOTID)
                siblings = &cmt_root->cl_pgs;
        else
                siblings = pg->cmt_siblings;

        /*
         * Traverse down the lineage until we find a level that needs
         * balancing, or we get to the end.
         */
        for (;;) {
                nsiblings = GROUP_SIZE(siblings);       /* self inclusive */

                /*
                 * A lone PG has nobody to balance against. An empty
                 * group is skipped as well, so that the modulo below
                 * never sees a zero divisor.
                 */
                if (nsiblings <= 1)
                        goto next_level;

                hint = CPU_PSEUDO_RANDOM() % nsiblings;

                /*
                 * Find a balancing candidate from among our siblings
                 * "hint" is a hint for where to start looking
                 */
                i = hint;
                do {
                        ASSERT(i < nsiblings);
                        pg_tmp = GROUP_ACCESS(siblings, i);

                        /*
                         * The candidate must not be us, and must
                         * have some CPU resources in the thread's
                         * partition
                         */
                        if (pg_tmp != pg &&
                            bitset_in_set(&tp->t_cpupart->cp_cmt_pgs,
                            ((pg_t *)pg_tmp)->pg_id)) {
                                tpg = pg_tmp;
                                break;
                        }

                        /* Wrap around so every sibling is visited once */
                        if (++i >= nsiblings)
                                i = 0;
                } while (i != hint);

                if (!tpg)
                        goto next_level; /* no candidates at this level */

                /*
                 * Decide if we should migrate from the current PG to a
                 * target PG given a policy
                 */
                if (cmt_should_migrate(pg, tpg, pg->cmt_policy, self))
                        break;

                /* Rejected: forget the candidate before descending */
                tpg = NULL;

next_level:
                if (++level == GROUP_SIZE(cmt_pgs))
                        break;

                pg = GROUP_ACCESS(cmt_pgs, level);
                siblings = pg->cmt_siblings;
        }

        if (tpg != NULL) {
                uint_t  tgt_size = GROUP_SIZE(&tpg->cmt_cpus_actv);
                uint_t  start, idx;

                /*
                 * With no active CPUs in the target there is nothing
                 * to pick from (and the modulo below would divide by
                 * zero), so stay with the hint CPU.
                 */
                if (tgt_size == 0)
                        return (cp);

                /*
                 * Select an idle CPU from the target
                 *
                 * The scan starts at a pseudo-random index and wraps
                 * around, visiting each active CPU at most once. The
                 * first CPU that is in the thread's partition and has
                 * a dispatch priority of -1 is chosen.
                 */
                start = CPU_PSEUDO_RANDOM() % tgt_size;
                idx = start;
                do {
                        newcp = GROUP_ACCESS(&tpg->cmt_cpus_actv, idx);
                        if (newcp->cpu_part == tp->t_cpupart &&
                            newcp->cpu_dispatch_pri == -1) {
                                cp = newcp;
                                break;
                        }
                        if (++idx == tgt_size)
                                idx = 0;
                } while (idx != start);
        }

        return (cp);
}