kernel/cgroup/cpuset-v1.c
// SPDX-License-Identifier: GPL-2.0-or-later

#include "cgroup-internal.h"
#include "cpuset-internal.h"

/*
 * Legacy hierarchy call to cgroup_transfer_tasks() is handled asynchronously
 */
struct cpuset_remove_tasks_struct {
        struct work_struct work;
        struct cpuset *cs;
};

/*
 * Frequency meter - How fast is some event occurring?
 *
 * These routines manage a digitally filtered, constant time based,
 * event frequency meter.  There are four routines:
 *   fmeter_init() - initialize a frequency meter.
 *   fmeter_markevent() - called each time the event happens.
 *   fmeter_getrate() - returns the recent rate of such events.
 *   fmeter_update() - internal routine used to update fmeter.
 *
 * A common data structure is passed to each of these routines,
 * which is used to keep track of the state required to manage the
 * frequency meter and its digital filter.
 *
 * The filter works on the number of events marked per unit time.
 * The filter is single-pole low-pass recursive (IIR).  The time unit
 * is 1 second.  Arithmetic is done using 32-bit integers scaled to
 * simulate 3 decimal digits of precision (multiplied by 1000).
 *
 * With an FM_COEF of 933, and a time base of 1 second, the filter
 * has a half-life of 10 seconds, meaning that if the events quit
 * happening, then the rate returned from fmeter_getrate()
 * will be cut in half every 10 seconds, until it converges to zero.
 *
 * It is not worth doing a real infinitely recursive filter.  If more
 * than FM_MAXTICKS ticks have elapsed since the last filter event,
 * just compute FM_MAXTICKS ticks worth, by which point the level
 * will be stable.
 *
 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
 * arithmetic overflow in the fmeter_update() routine.
 *
 * Given the simple 32 bit integer arithmetic used, this meter works
 * best for reporting rates between one per millisecond (msec) and
 * one per 32 (approx) seconds.  At constant rates faster than one
 * per msec it maxes out at values just under 1,000,000.  At constant
 * rates between one per msec and one per second, it will stabilize
 * to a value N*1000, where N is the rate of events per second.
 * At constant rates between one per second and one per 32 seconds,
 * it will be choppy, moving up on the seconds that have an event,
 * and then decaying until the next event.  At rates slower than
 * about one in 32 seconds, it decays all the way back to zero between
 * each event.
 */

#define FM_COEF 933             /* coefficient for half-life of 10 secs */
#define FM_MAXTICKS ((u32)99)   /* useless computing more ticks than this */
#define FM_MAXCNT 1000000       /* limit cnt to avoid overflow */
#define FM_SCALE 1000           /* faux fixed point scale */
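
/*
 * Illustrative arithmetic for the constants above (a sanity check,
 * not code): decaying by FM_COEF/FM_SCALE = 0.933 per tick gives
 * 0.933^10 ~= 0.50, i.e. the 10 second half-life.  At a steady one
 * event per second, val converges to v = 0.933*v + 67, i.e. v = 1000,
 * matching the N*1000 steady state described above.  After
 * FM_MAXTICKS = 99 idle ticks, 0.933^99 ~= 0.001, so any remaining
 * level is negligible and computing further ticks is pointless.
 */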

/* Initialize a frequency meter */
static void fmeter_init(struct fmeter *fmp)
{
        fmp->cnt = 0;
        fmp->val = 0;
        fmp->time = 0;
        spin_lock_init(&fmp->lock);
}

/* Internal meter update - process cnt events and update value */
static void fmeter_update(struct fmeter *fmp)
{
        time64_t now;
        u32 ticks;

        now = ktime_get_seconds();
        ticks = now - fmp->time;

        if (ticks == 0)
                return;

        ticks = min(FM_MAXTICKS, ticks);
        while (ticks-- > 0)
                fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
        fmp->time = now;

        fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
        fmp->cnt = 0;
}

/* Process any previous ticks, then bump cnt by one (times scale). */
static void fmeter_markevent(struct fmeter *fmp)
{
        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
        spin_unlock(&fmp->lock);
}

/* Process any previous ticks, then return current value. */
static int fmeter_getrate(struct fmeter *fmp)
{
        int val;

        spin_lock(&fmp->lock);
        fmeter_update(fmp);
        val = fmp->val;
        spin_unlock(&fmp->lock);
        return val;
}
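
/*
 * Typical fmeter usage (sketch):
 *
 *        fmeter_init(&fmp);              (once, at setup)
 *        fmeter_markevent(&fmp);         (on each event)
 *        rate = fmeter_getrate(&fmp);    (~events/sec, times FM_SCALE)
 *
 * In this file, __cpuset_memory_pressure_bump() drives the mark side
 * and the FILE_MEMORY_PRESSURE read in cpuset_read_u64() the get side.
 */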

/*
 * Collection of memory_pressure is suppressed unless
 * this flag is enabled by writing "1" to the special
 * cpuset file 'memory_pressure_enabled' in the root cpuset.
 */

int cpuset_memory_pressure_enabled __read_mostly;
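
/*
 * The fast-path wrapper in include/linux/cpuset.h checks this flag
 * before calling in; roughly:
 *
 *        #define cpuset_memory_pressure_bump()                 \
 *                do {                                          \
 *                        if (cpuset_memory_pressure_enabled)   \
 *                                __cpuset_memory_pressure_bump();\
 *                } while (0)
 */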

/*
 * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
 *
 * Keep a running average of the rate of synchronous (direct)
 * page reclaim efforts initiated by tasks in each cpuset.
 *
 * This represents the rate at which some task in the cpuset
 * ran low on memory on all nodes it was allowed to use, and
 * had to enter the kernel's page reclaim code in an effort to
 * create more free memory by tossing clean pages or swapping
 * or writing dirty pages.
 *
 * Display to user space in the per-cpuset read-only file
 * "memory_pressure".  Value displayed is an integer
 * representing the recent rate of entry into the synchronous
 * (direct) page reclaim by any task attached to the cpuset.
 */

void __cpuset_memory_pressure_bump(void)
{
        rcu_read_lock();
        fmeter_markevent(&task_cs(current)->fmeter);
        rcu_read_unlock();
}

static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
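        /*
         * Requested level semantics (see
         * Documentation/admin-guide/cgroup-v1/cpusets.rst): -1 requests
         * the system default; 0 disables the idle/wakeup balancing
         * search; larger values widen it (roughly: 1 searches sibling
         * threads, 2 cores in a package, and so on up to the whole
         * system, capped at sched_domain_level_max + 1 below).
         */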
#ifdef CONFIG_SMP
        if (val < -1 || val > sched_domain_level_max + 1)
                return -EINVAL;
#endif

        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
                if (!cpumask_empty(cs->cpus_allowed) &&
                    is_sched_load_balance(cs))
                        rebuild_sched_domains_locked();
        }

        return 0;
}

static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
                            s64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        int retval = -ENODEV;

        cpuset_full_lock();
        if (!is_cpuset_online(cs))
                goto out_unlock;

        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                pr_info_once("cpuset.%s is deprecated\n", cft->name);
                retval = update_relax_domain_level(cs, val);
                break;
        default:
                retval = -EINVAL;
                break;
        }
out_unlock:
        cpuset_full_unlock();
        return retval;
}

static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;

        switch (type) {
        case FILE_SCHED_RELAX_DOMAIN_LEVEL:
                return cs->relax_domain_level;
        default:
                BUG();
        }

        /* Unreachable but makes gcc happy */
        return 0;
}

/*
 * update task's spread flag if cpuset's page/slab spread flag is set
 *
 * Call with callback_lock or cpuset_mutex held. The check can be skipped
 * if on default hierarchy.
 */
void cpuset1_update_task_spread_flags(struct cpuset *cs,
                                        struct task_struct *tsk)
{
        if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
                return;

        if (is_spread_page(cs))
                task_set_spread_page(tsk);
        else
                task_clear_spread_page(tsk);

        if (is_spread_slab(cs))
                task_set_spread_slab(tsk);
        else
                task_clear_spread_slab(tsk);
}

/**
 * cpuset1_update_tasks_flags - update the spread flags of tasks in the cpuset.
 * @cs: the cpuset in which each task's spread flags need to be changed
 *
 * Iterate through each task of @cs updating its spread flags.  As this
 * function is called with cpuset_mutex held, cpuset membership stays
 * stable.
 */
void cpuset1_update_tasks_flags(struct cpuset *cs)
{
        struct css_task_iter it;
        struct task_struct *task;

        css_task_iter_start(&cs->css, 0, &it);
        while ((task = css_task_iter_next(&it)))
                cpuset1_update_task_spread_flags(cs, task);
        css_task_iter_end(&it);
}

/*
 * If CPU and/or memory hotplug handlers unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
 * cpuset to its next-highest non-empty parent.
 */
static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
{
        struct cpuset *parent;

        /*
         * Find its next-highest non-empty parent (the top cpuset
         * has online cpus, so it can't be empty).
         */
        parent = parent_cs(cs);
        while (cpumask_empty(parent->cpus_allowed) ||
                        nodes_empty(parent->mems_allowed))
                parent = parent_cs(parent);

        if (cgroup_transfer_tasks(parent->css.cgroup, cs->css.cgroup)) {
                pr_err("cpuset: failed to transfer tasks out of empty cpuset ");
                pr_cont_cgroup_name(cs->css.cgroup);
                pr_cont("\n");
        }
}

static void cpuset_migrate_tasks_workfn(struct work_struct *work)
{
        struct cpuset_remove_tasks_struct *s;

        s = container_of(work, struct cpuset_remove_tasks_struct, work);
        remove_tasks_in_empty_cpuset(s->cs);
        css_put(&s->cs->css);
        kfree(s);
}

void cpuset1_hotplug_update_tasks(struct cpuset *cs,
                            struct cpumask *new_cpus, nodemask_t *new_mems,
                            bool cpus_updated, bool mems_updated)
{
        bool is_empty;

        cpuset_callback_lock_irq();
        cpumask_copy(cs->cpus_allowed, new_cpus);
        cpumask_copy(cs->effective_cpus, new_cpus);
        cs->mems_allowed = *new_mems;
        cs->effective_mems = *new_mems;
        cpuset_callback_unlock_irq();

        /*
         * Don't call cpuset_update_tasks_cpumask() if the cpuset becomes empty,
         * as the tasks will be migrated to an ancestor.
         */
        if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
                cpuset_update_tasks_cpumask(cs, new_cpus);
        if (mems_updated && !nodes_empty(cs->mems_allowed))
                cpuset_update_tasks_nodemask(cs);

        is_empty = cpumask_empty(cs->cpus_allowed) ||
                   nodes_empty(cs->mems_allowed);

        /*
         * Move tasks to the nearest ancestor with execution resources.
         * This is a full cgroup operation which will also call back into
         * cpuset. Execute it asynchronously using a workqueue.
         */
        if (is_empty && cs->css.cgroup->nr_populated_csets &&
            css_tryget_online(&cs->css)) {
                struct cpuset_remove_tasks_struct *s;

                s = kzalloc(sizeof(*s), GFP_KERNEL);
                if (WARN_ON_ONCE(!s)) {
                        css_put(&cs->css);
                        return;
                }

                s->cs = cs;
                INIT_WORK(&s->work, cpuset_migrate_tasks_workfn);
                schedule_work(&s->work);
        }
}

/*
 * is_cpuset_subset(p, q) - Is cpuset p a subset of cpuset q?
 *
 * One cpuset is a subset of another if all its allowed CPUs and
 * Memory Nodes are a subset of the other, and its exclusive flags
 * are only set if the other's are set.  Call holding cpuset_mutex.
 */

static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
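        /* On booleans, "p_flag <= q_flag" reads as "p implies q". */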
        return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
                nodes_subset(p->mems_allowed, q->mems_allowed) &&
                is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
                is_mem_exclusive(p) <= is_mem_exclusive(q);
}

/*
 * cpuset1_validate_change() - Validate conditions specific to legacy (v1)
 *                            behavior.
 */
int cpuset1_validate_change(struct cpuset *cur, struct cpuset *trial)
{
        struct cgroup_subsys_state *css;
        struct cpuset *c, *par;
        int ret;

        WARN_ON_ONCE(!rcu_read_lock_held());

        /* Each of our child cpusets must be a subset of us */
        ret = -EBUSY;
        cpuset_for_each_child(c, css, cur)
                if (!is_cpuset_subset(c, trial))
                        goto out;

        /* On legacy hierarchy, we must be a subset of our parent cpuset. */
        ret = -EACCES;
        par = parent_cs(cur);
        if (par && !is_cpuset_subset(trial, par))
                goto out;

        /*
         * Cpusets with tasks - existing or newly being attached - can't
         * be changed to have empty cpus_allowed or mems_allowed.
         */
        ret = -ENOSPC;
        if (cpuset_is_populated(cur)) {
                if (!cpumask_empty(cur->cpus_allowed) &&
                    cpumask_empty(trial->cpus_allowed))
                        goto out;
                if (!nodes_empty(cur->mems_allowed) &&
                    nodes_empty(trial->mems_allowed))
                        goto out;
        }

        ret = 0;
out:
        return ret;
}

/*
 * cpuset1_cpus_excl_conflict() - Check if two cpusets have an exclusive
 *                                CPU conflict under legacy (v1) rules
 * @cs1: first cpuset to check
 * @cs2: second cpuset to check
 *
 * Returns: true if CPU exclusivity conflict exists, false otherwise
 *
 * If either cpuset is CPU exclusive, their allowed CPUs cannot intersect.
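 * For example, if @cs1 is CPU exclusive with cpus 0-3, a sibling @cs2
 * with cpus 2-5 conflicts, while one using only cpus 4-7 does not.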
 */
bool cpuset1_cpus_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
{
        if (is_cpu_exclusive(cs1) || is_cpu_exclusive(cs2))
                return cpumask_intersects(cs1->cpus_allowed,
                                          cs2->cpus_allowed);

        return false;
}

#ifdef CONFIG_PROC_PID_CPUSET
/*
 * proc_cpuset_show()
 *  - Print task's cpuset path into seq_file.
 *  - Used for /proc/<pid>/cpuset.
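 *  - Output is a single line, e.g. "/Charlie" for a task in a cpuset
 *    named "Charlie" directly under the root.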
 */
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
                     struct pid *pid, struct task_struct *tsk)
{
        char *buf;
        struct cgroup_subsys_state *css;
        int retval;

        retval = -ENOMEM;
        buf = kmalloc(PATH_MAX, GFP_KERNEL);
        if (!buf)
                goto out;

        rcu_read_lock();
        spin_lock_irq(&css_set_lock);
        css = task_css(tsk, cpuset_cgrp_id);
        retval = cgroup_path_ns_locked(css->cgroup, buf, PATH_MAX,
                                       current->nsproxy->cgroup_ns);
        spin_unlock_irq(&css_set_lock);
        rcu_read_unlock();

        if (retval == -E2BIG)
                retval = -ENAMETOOLONG;
        if (retval < 0)
                goto out_free;
        seq_puts(m, buf);
        seq_putc(m, '\n');
        retval = 0;
out_free:
        kfree(buf);
out:
        return retval;
}
#endif /* CONFIG_PROC_PID_CPUSET */

static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                return is_cpu_exclusive(cs);
        case FILE_MEM_EXCLUSIVE:
                return is_mem_exclusive(cs);
        case FILE_MEM_HARDWALL:
                return is_mem_hardwall(cs);
        case FILE_SCHED_LOAD_BALANCE:
                return is_sched_load_balance(cs);
        case FILE_MEMORY_MIGRATE:
                return is_memory_migrate(cs);
        case FILE_MEMORY_PRESSURE_ENABLED:
                return cpuset_memory_pressure_enabled;
        case FILE_MEMORY_PRESSURE:
                return fmeter_getrate(&cs->fmeter);
        case FILE_SPREAD_PAGE:
                return is_spread_page(cs);
        case FILE_SPREAD_SLAB:
                return is_spread_slab(cs);
        default:
                BUG();
        }

        /* Unreachable but makes gcc happy */
        return 0;
}

static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
                            u64 val)
{
        struct cpuset *cs = css_cs(css);
        cpuset_filetype_t type = cft->private;
        int retval = 0;

        cpuset_full_lock();
        if (!is_cpuset_online(cs)) {
                retval = -ENODEV;
                goto out_unlock;
        }

        switch (type) {
        case FILE_CPU_EXCLUSIVE:
                retval = cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, val);
                break;
        case FILE_MEM_EXCLUSIVE:
                pr_info_once("cpuset.%s is deprecated\n", cft->name);
                retval = cpuset_update_flag(CS_MEM_EXCLUSIVE, cs, val);
                break;
        case FILE_MEM_HARDWALL:
                pr_info_once("cpuset.%s is deprecated\n", cft->name);
                retval = cpuset_update_flag(CS_MEM_HARDWALL, cs, val);
                break;
        case FILE_SCHED_LOAD_BALANCE:
                pr_info_once("cpuset.%s is deprecated, use cpuset.cpus.partition instead\n", cft->name);
                retval = cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, val);
                break;
        case FILE_MEMORY_MIGRATE:
                pr_info_once("cpuset.%s is deprecated\n", cft->name);
                retval = cpuset_update_flag(CS_MEMORY_MIGRATE, cs, val);
                break;
        case FILE_MEMORY_PRESSURE_ENABLED:
                pr_info_once("cpuset.%s is deprecated, use memory.pressure with CONFIG_PSI instead\n", cft->name);
                cpuset_memory_pressure_enabled = !!val;
                break;
        case FILE_SPREAD_PAGE:
                pr_info_once("cpuset.%s is deprecated\n", cft->name);
                retval = cpuset_update_flag(CS_SPREAD_PAGE, cs, val);
                break;
        case FILE_SPREAD_SLAB:
                pr_warn_once("cpuset.%s is deprecated\n", cft->name);
                retval = cpuset_update_flag(CS_SPREAD_SLAB, cs, val);
                break;
        default:
                retval = -EINVAL;
                break;
        }
out_unlock:
        cpuset_full_unlock();
        return retval;
}

void cpuset1_init(struct cpuset *cs)
{
        fmeter_init(&cs->fmeter);
        cs->relax_domain_level = -1;
}

void cpuset1_online_css(struct cgroup_subsys_state *css)
{
        struct cpuset *tmp_cs;
        struct cgroup_subsys_state *pos_css;
        struct cpuset *cs = css_cs(css);
        struct cpuset *parent = parent_cs(cs);

        lockdep_assert_cpus_held();
        lockdep_assert_cpuset_lock_held();

        if (is_spread_page(parent))
                set_bit(CS_SPREAD_PAGE, &cs->flags);
        if (is_spread_slab(parent))
                set_bit(CS_SPREAD_SLAB, &cs->flags);

        if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
                return;

        /*
         * Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
         * set.  This flag handling is implemented in cgroup core for
         * historical reasons - the flag may be specified during mount.
         *
         * Currently, if any sibling cpusets have exclusive cpus or mem, we
         * refuse to clone the configuration - thereby refusing to admit
         * the task, and as a result failing the sys_unshare() or
         * clone() which initiated it.  If this becomes a problem for some
         * users who wish to allow that scenario, then this could be
         * changed to grant parent->cpus_allowed-sibling_cpus_exclusive
         * (and likewise for mems) to the new cgroup.
         */
        rcu_read_lock();
        cpuset_for_each_child(tmp_cs, pos_css, parent) {
                if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) {
                        rcu_read_unlock();
                        return;
                }
        }
        rcu_read_unlock();

        cpuset_callback_lock_irq();
        cs->mems_allowed = parent->mems_allowed;
        cs->effective_mems = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
        cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
        cpuset_callback_unlock_irq();
}

static void
update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
{
        if (dattr->relax_domain_level < c->relax_domain_level)
                dattr->relax_domain_level = c->relax_domain_level;
}

static void update_domain_attr_tree(struct sched_domain_attr *dattr,
                                    struct cpuset *root_cs)
{
        struct cpuset *cp;
        struct cgroup_subsys_state *pos_css;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, root_cs) {
                /* skip the whole subtree if @cp doesn't have any CPU */
                if (cpumask_empty(cp->cpus_allowed)) {
                        pos_css = css_rightmost_descendant(pos_css);
                        continue;
                }

                if (is_sched_load_balance(cp))
                        update_domain_attr(dattr, cp);
        }
        rcu_read_unlock();
}

/*
 * cpuset1_generate_sched_domains()
 *
 * Finding the best partition (set of domains):
 *      The double nested loops below over i, j scan over the load
 *      balanced cpusets (using the array of cpuset pointers in csa[])
 *      looking for pairs of cpusets that have overlapping cpus_allowed
 *      and merging them using a union-find algorithm.
 *
 *      The union of the cpus_allowed masks from the set of all cpusets
 *      having the same root then form the one element of the partition
 *      (one sched domain) to be passed to partition_sched_domains().
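 *
 *      For example, three load-balanced cpusets with cpus 0-3, 2-5 and
 *      6-7 produce two sched domains: the first two overlap and are
 *      merged into {0-5}, while {6-7} forms its own domain.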
 */
int cpuset1_generate_sched_domains(cpumask_var_t **domains,
                        struct sched_domain_attr **attributes)
{
        struct cpuset *cp;      /* top-down scan of cpusets */
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
        int i, j;               /* indices for partition finding loops */
        cpumask_var_t *doms;    /* resulting partition; i.e. sched domains */
        struct sched_domain_attr *dattr;  /* attributes for custom domains */
        int ndoms = 0;          /* number of sched domains in result */
        int nslot;              /* next empty doms[] struct cpumask slot */
        struct cgroup_subsys_state *pos_css;
        int nslot_update;

        lockdep_assert_cpuset_lock_held();

        doms = NULL;
        dattr = NULL;
        csa = NULL;

        /* Special case for the 99% of systems with one, full, sched domain */
        if (is_sched_load_balance(&top_cpuset)) {
                ndoms = 1;
                doms = alloc_sched_domains(ndoms);
                if (!doms)
                        goto done;

                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                cpumask_and(doms[0], top_cpuset.effective_cpus,
                            housekeeping_cpumask(HK_TYPE_DOMAIN));

                goto done;
        }

        csa = kmalloc_array(nr_cpusets(), sizeof(cp), GFP_KERNEL);
        if (!csa)
                goto done;
        csn = 0;

        rcu_read_lock();
        cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
                if (cp == &top_cpuset)
                        continue;

                /*
                 * Continue traversing beyond @cp iff @cp has some CPUs and
                 * isn't load balancing.  The former is obvious.  The
                 * latter: All child cpusets contain a subset of the
                 * parent's cpus, so just skip them, and then we call
                 * update_domain_attr_tree() to calc relax_domain_level of
                 * the corresponding sched domain.
                 */
                if (!cpumask_empty(cp->cpus_allowed) &&
                    !(is_sched_load_balance(cp) &&
                      cpumask_intersects(cp->cpus_allowed,
                                         housekeeping_cpumask(HK_TYPE_DOMAIN))))
                        continue;

                if (is_sched_load_balance(cp) &&
                    !cpumask_empty(cp->effective_cpus))
                        csa[csn++] = cp;

                /* skip @cp's subtree */
                pos_css = css_rightmost_descendant(pos_css);
                continue;
        }
        rcu_read_unlock();

        for (i = 0; i < csn; i++)
                uf_node_init(&csa[i]->node);

        /* Merge overlapping cpusets */
        for (i = 0; i < csn; i++) {
                for (j = i + 1; j < csn; j++) {
                        if (cpusets_overlap(csa[i], csa[j]))
                                uf_union(&csa[i]->node, &csa[j]->node);
                }
        }

        /* Count the total number of domains */
        for (i = 0; i < csn; i++) {
                if (uf_find(&csa[i]->node) == &csa[i]->node)
                        ndoms++;
        }

        /*
         * Now we know how many domains to create.
         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
         */
        doms = alloc_sched_domains(ndoms);
        if (!doms)
                goto done;

        /*
         * The rest of the code, including the scheduler, can deal with
         * the dattr==NULL case. No need to abort if the allocation fails.
         */
        dattr = kmalloc_array(ndoms, sizeof(struct sched_domain_attr), GFP_KERNEL);

        for (nslot = 0, i = 0; i < csn; i++) {
                nslot_update = 0;
                for (j = i; j < csn; j++) {
                        if (uf_find(&csa[j]->node) == &csa[i]->node) {
                                struct cpumask *dp = doms[nslot];

                                if (i == j) {
                                        nslot_update = 1;
                                        cpumask_clear(dp);
                                        if (dattr)
                                                *(dattr + nslot) = SD_ATTR_INIT;
                                }
                                cpumask_or(dp, dp, csa[j]->effective_cpus);
                                cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
                                if (dattr)
                                        update_domain_attr_tree(dattr + nslot, csa[j]);
                        }
                }
                if (nslot_update)
                        nslot++;
        }
        BUG_ON(nslot != ndoms);

done:
        kfree(csa);

        /*
         * Fall back to the default domain if kmalloc() failed.
         * See comments in partition_sched_domains().
         */
        if (doms == NULL)
                ndoms = 1;

        *domains    = doms;
        *attributes = dattr;
        return ndoms;
}

/*
 * for the common functions, 'private' gives the type of file
 */

struct cftype cpuset1_files[] = {
        {
                .name = "cpus",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * NR_CPUS),
                .private = FILE_CPULIST,
        },

        {
                .name = "mems",
                .seq_show = cpuset_common_seq_show,
                .write = cpuset_write_resmask,
                .max_write_len = (100U + 6 * MAX_NUMNODES),
                .private = FILE_MEMLIST,
        },

        {
                .name = "effective_cpus",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_CPULIST,
        },

        {
                .name = "effective_mems",
                .seq_show = cpuset_common_seq_show,
                .private = FILE_EFFECTIVE_MEMLIST,
        },

        {
                .name = "cpu_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_CPU_EXCLUSIVE,
        },

        {
                .name = "mem_exclusive",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_EXCLUSIVE,
        },

        {
                .name = "mem_hardwall",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEM_HARDWALL,
        },

        {
                .name = "sched_load_balance",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SCHED_LOAD_BALANCE,
        },

        {
                .name = "sched_relax_domain_level",
                .read_s64 = cpuset_read_s64,
                .write_s64 = cpuset_write_s64,
                .private = FILE_SCHED_RELAX_DOMAIN_LEVEL,
        },

        {
                .name = "memory_migrate",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_MIGRATE,
        },

        {
                .name = "memory_pressure",
                .read_u64 = cpuset_read_u64,
                .private = FILE_MEMORY_PRESSURE,
        },

        {
                .name = "memory_spread_page",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_PAGE,
        },

        {
                /* obsolete, may be removed in the future */
                .name = "memory_spread_slab",
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_SPREAD_SLAB,
        },

        {
                .name = "memory_pressure_enabled",
                .flags = CFTYPE_ONLY_ON_ROOT,
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_PRESSURE_ENABLED,
        },

        { }     /* terminate */
};