#include "cpuset-internal.h"
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/sched/deadline.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/security.h>
#include <linux/oom.h>
#include <linux/sched/isolation.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
#include <linux/task_work.h>
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
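/*
 * Human-readable strings for the PERR_* codes shown in
 * cpuset.cpus.partition when a partition root is invalid.
 */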
static const char * const perr_strings[] = {
[PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus.exclusive",
[PERR_INVPARENT] = "Parent is an invalid partition root",
[PERR_NOTPART] = "Parent is not a partition root",
[PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
[PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
[PERR_HOTPLUG] = "No cpu available due to hotplug",
[PERR_CPUSEMPTY] = "cpuset.cpus and cpuset.cpus.exclusive are empty",
[PERR_HKEEPING] = "partition config conflicts with housekeeping setup",
[PERR_ACCESS] = "Enable partition not permitted",
[PERR_REMOTE] = "Have remote partition underneath",
};
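/*
 * cpuset_mutex protects the cpuset hierarchy for writers.  cpuset_top_mutex
 * is taken first, outside the cpus read lock (see cpuset_full_lock()), and
 * stays held across the point where the other locks are dropped to apply a
 * housekeeping update (see cpuset_update_sd_hk_unlock()).
 */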
static DEFINE_MUTEX(cpuset_top_mutex);
static DEFINE_MUTEX(cpuset_mutex);
static cpumask_var_t subpartitions_cpus;
static cpumask_var_t isolated_cpus;
static bool update_housekeeping;
static cpumask_var_t isolated_hk_cpus;
static bool force_sd_rebuild;
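/*
 * Partition root states (cpuset.cpus.partition):
 *   PRS_MEMBER   - not a partition root (the default)
 *   PRS_ROOT     - a load-balanced partition root
 *   PRS_ISOLATED - a partition root with no load balancing
 * A valid state that can no longer be satisfied is flipped to its negative
 * counterpart; see make_partition_invalid().
 */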
#define PRS_MEMBER 0
#define PRS_ROOT 1
#define PRS_ISOLATED 2
#define PRS_INVALID_ROOT -1
#define PRS_INVALID_ISOLATED -2
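/*
 * Scratch cpumasks (CPUs to add, CPUs to delete, new effective CPUs) passed
 * around while partition and effective cpumasks are recomputed.
 */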
struct tmpmasks {
cpumask_var_t addmask, delmask;
cpumask_var_t new_cpus;
};
void inc_dl_tasks_cs(struct task_struct *p)
{
struct cpuset *cs = task_cs(p);
cs->nr_deadline_tasks++;
}
void dec_dl_tasks_cs(struct task_struct *p)
{
struct cpuset *cs = task_cs(p);
cs->nr_deadline_tasks--;
}
static inline bool is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
static inline bool is_partition_invalid(const struct cpuset *cs)
{
return cs->partition_root_state < 0;
}
static inline bool cs_is_member(const struct cpuset *cs)
{
return cs->partition_root_state == PRS_MEMBER;
}
static inline void make_partition_invalid(struct cpuset *cs)
{
if (cs->partition_root_state > 0)
cs->partition_root_state = -cs->partition_root_state;
}
static inline void notify_partition_change(struct cpuset *cs, int old_prs)
{
if (old_prs == cs->partition_root_state)
return;
cgroup_file_notify(&cs->partition_file);
if (is_partition_valid(cs))
WRITE_ONCE(cs->prs_err, PERR_NONE);
}
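/*
 * The top (root) cpuset.  Its cpumasks and nodemasks are set up in
 * cpuset_init() and kept in sync with hotplug by cpuset_handle_hotplug().
 */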
struct cpuset top_cpuset = {
.flags = BIT(CS_CPU_EXCLUSIVE) |
BIT(CS_MEM_EXCLUSIVE) | BIT(CS_SCHED_LOAD_BALANCE),
.partition_root_state = PRS_ROOT,
};
void cpuset_lock(void)
{
mutex_lock(&cpuset_mutex);
}
void cpuset_unlock(void)
{
mutex_unlock(&cpuset_mutex);
}
void lockdep_assert_cpuset_lock_held(void)
{
lockdep_assert_held(&cpuset_mutex);
}
void cpuset_full_lock(void)
{
mutex_lock(&cpuset_top_mutex);
cpus_read_lock();
mutex_lock(&cpuset_mutex);
}
void cpuset_full_unlock(void)
{
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
mutex_unlock(&cpuset_top_mutex);
}
#ifdef CONFIG_LOCKDEP
bool lockdep_is_cpuset_held(void)
{
return lockdep_is_held(&cpuset_mutex) ||
lockdep_is_held(&cpuset_top_mutex);
}
#endif
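/*
 * callback_lock protects the cpumasks and nodemasks for readers that cannot
 * take cpuset_mutex (including irq-safe contexts); writers hold both
 * cpuset_mutex and callback_lock.
 */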
static DEFINE_SPINLOCK(callback_lock);
void cpuset_callback_lock_irq(void)
{
spin_lock_irq(&callback_lock);
}
void cpuset_callback_unlock_irq(void)
{
spin_unlock_irq(&callback_lock);
}
static struct workqueue_struct *cpuset_migrate_mm_wq;
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
static inline void check_insane_mems_config(nodemask_t *nodes)
{
if (!cpusets_insane_config() &&
movable_only_nodes(nodes)) {
static_branch_enable_cpuslocked(&cpusets_insane_config_key);
pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
"Cpuset allocations might fail even with a lot of memory available.\n",
nodemask_pr_args(nodes));
}
}
static inline void dec_attach_in_progress_locked(struct cpuset *cs)
{
lockdep_assert_cpuset_lock_held();
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
}
static inline void dec_attach_in_progress(struct cpuset *cs)
{
mutex_lock(&cpuset_mutex);
dec_attach_in_progress_locked(cs);
mutex_unlock(&cpuset_mutex);
}
static inline bool cpuset_v2(void)
{
return !IS_ENABLED(CONFIG_CPUSETS_V1) ||
cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
}
static inline bool is_in_v2_mode(void)
{
return cpuset_v2() ||
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
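/*
 * partition_is_populated - does a partition (excluding valid child
 * partitions and @excluded_child) have tasks?
 *
 * Returns true if @cs itself is populated or has an attach in progress, or
 * if any descendant that is not itself a valid partition root is populated.
 */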
static inline bool partition_is_populated(struct cpuset *cs,
struct cpuset *excluded_child)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
if (cs->css.cgroup->nr_populated_csets ||
cs->attach_in_progress)
return true;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
if (cp == cs || cp == excluded_child)
continue;
if (is_partition_valid(cp)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
if (cpuset_is_populated(cp)) {
rcu_read_unlock();
return true;
}
}
rcu_read_unlock();
return false;
}
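/*
 * guarantee_active_cpus - return active CPUs @tsk may run on
 *
 * Starting from @tsk's cpuset, walk up the hierarchy until a cpuset is found
 * whose effective_cpus intersects the task's possible and active CPUs, and
 * store that intersection in @pmask.  Callers hold either callback_lock or
 * cpuset_mutex to keep the cpumasks stable.
 */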
static void guarantee_active_cpus(struct task_struct *tsk,
struct cpumask *pmask)
{
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
struct cpuset *cs;
if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_active_mask)))
cpumask_copy(pmask, cpu_active_mask);
rcu_read_lock();
cs = task_cs(tsk);
while (!cpumask_intersects(cs->effective_cpus, pmask))
cs = parent_cs(cs);
cpumask_and(pmask, pmask, cs->effective_cpus);
rcu_read_unlock();
}
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
while (!nodes_and(*pmask, cs->effective_mems, node_states[N_MEMORY]))
cs = parent_cs(cs);
}
static inline int alloc_cpumasks(cpumask_var_t *pmasks[], u32 size)
{
int i;
for (i = 0; i < size; i++) {
if (!zalloc_cpumask_var(pmasks[i], GFP_KERNEL)) {
while (--i >= 0)
free_cpumask_var(*pmasks[i]);
return -ENOMEM;
}
}
return 0;
}
static inline int alloc_tmpmasks(struct tmpmasks *tmp)
{
cpumask_var_t *pmask[3] = {
&tmp->new_cpus,
&tmp->addmask,
&tmp->delmask
};
return alloc_cpumasks(pmask, ARRAY_SIZE(pmask));
}
static inline void free_tmpmasks(struct tmpmasks *tmp)
{
if (!tmp)
return;
free_cpumask_var(tmp->new_cpus);
free_cpumask_var(tmp->addmask);
free_cpumask_var(tmp->delmask);
}
static struct cpuset *dup_or_alloc_cpuset(struct cpuset *cs)
{
struct cpuset *trial;
trial = cs ? kmemdup(cs, sizeof(*cs), GFP_KERNEL) :
kzalloc_obj(*cs);
if (!trial)
return NULL;
cpumask_var_t *pmask[4] = {
&trial->cpus_allowed,
&trial->effective_cpus,
&trial->effective_xcpus,
&trial->exclusive_cpus
};
if (alloc_cpumasks(pmask, ARRAY_SIZE(pmask))) {
kfree(trial);
return NULL;
}
if (cs) {
cpumask_copy(trial->cpus_allowed, cs->cpus_allowed);
cpumask_copy(trial->effective_cpus, cs->effective_cpus);
cpumask_copy(trial->effective_xcpus, cs->effective_xcpus);
cpumask_copy(trial->exclusive_cpus, cs->exclusive_cpus);
}
return trial;
}
static inline void free_cpuset(struct cpuset *cs)
{
free_cpumask_var(cs->cpus_allowed);
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->effective_xcpus);
free_cpumask_var(cs->exclusive_cpus);
kfree(cs);
}
static inline struct cpumask *user_xcpus(struct cpuset *cs)
{
return cpumask_empty(cs->exclusive_cpus) ? cs->cpus_allowed
: cs->exclusive_cpus;
}
static inline bool xcpus_empty(struct cpuset *cs)
{
return cpumask_empty(cs->cpus_allowed) &&
cpumask_empty(cs->exclusive_cpus);
}
static inline bool cpusets_are_exclusive(struct cpuset *cs1, struct cpuset *cs2)
{
struct cpumask *xcpus1 = user_xcpus(cs1);
struct cpumask *xcpus2 = user_xcpus(cs2);
if (cpumask_intersects(xcpus1, xcpus2))
return false;
return true;
}
static inline bool cpus_excl_conflict(struct cpuset *trial, struct cpuset *sibling,
bool xcpus_changed)
{
if (!cpuset_v2())
return cpuset1_cpus_excl_conflict(trial, sibling);
if (xcpus_changed && !cpumask_empty(sibling->cpus_allowed) &&
cpumask_subset(sibling->cpus_allowed, trial->exclusive_cpus))
return true;
return cpumask_intersects(trial->exclusive_cpus, sibling->exclusive_cpus);
}
static inline bool mems_excl_conflict(struct cpuset *cs1, struct cpuset *cs2)
{
if ((is_mem_exclusive(cs1) || is_mem_exclusive(cs2)))
return nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
return false;
}
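/*
 * validate_change - check that the changes proposed in @trial are legal
 *
 * Return 0 if valid, -errno otherwise.  Beyond the v1-only checks, this
 * refuses to shrink a load-balanced, CPU-exclusive cpuset when that would
 * leave insufficient room for its SCHED_DEADLINE tasks (-EBUSY) and rejects
 * exclusive CPU or memory conflicts with sibling cpusets (-EINVAL).
 */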
static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
bool xcpus_changed;
int ret = 0;
rcu_read_lock();
if (!is_in_v2_mode())
ret = cpuset1_validate_change(cur, trial);
if (ret)
goto out;
if (cur == &top_cpuset)
goto out;
par = parent_cs(cur);
ret = -EBUSY;
if (is_cpu_exclusive(cur) && is_sched_load_balance(cur) &&
!cpuset_cpumask_can_shrink(cur->effective_cpus, user_xcpus(trial)))
goto out;
ret = -EINVAL;
xcpus_changed = !cpumask_equal(cur->exclusive_cpus, trial->exclusive_cpus);
cpuset_for_each_child(c, css, par) {
if (c == cur)
continue;
if (cpus_excl_conflict(trial, c, xcpus_changed))
goto out;
if (mems_excl_conflict(trial, c))
goto out;
}
ret = 0;
out:
rcu_read_unlock();
return ret;
}
#ifdef CONFIG_SMP
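/*
 * generate_sched_domains - build the cpumask array for sched domains
 *
 * Each valid PRS_ROOT partition with a non-empty effective_cpus gets its own
 * scheduling domain; the root domain is the top cpuset's effective CPUs
 * restricted to the HK_TYPE_DOMAIN_BOOT housekeeping CPUs.  If there are no
 * sub-partitions, a single root domain is generated.  Returns the number of
 * domains and fills in *domains and *attributes for
 * partition_sched_domains().
 */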
static int generate_sched_domains(cpumask_var_t **domains,
struct sched_domain_attr **attributes)
{
struct cpuset *cp;
struct cpuset **csa;
int i, j;
cpumask_var_t *doms;
struct sched_domain_attr *dattr;
int ndoms = 0;
struct cgroup_subsys_state *pos_css;
if (!cpuset_v2())
return cpuset1_generate_sched_domains(domains, attributes);
doms = NULL;
dattr = NULL;
csa = NULL;
if (cpumask_empty(subpartitions_cpus)) {
ndoms = 1;
goto generate_doms;
}
csa = kmalloc_objs(cp, nr_cpusets());
if (!csa)
goto done;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) {
if ((cp->partition_root_state == PRS_ROOT) &&
!cpumask_empty(cp->effective_cpus))
csa[ndoms++] = cp;
if (!is_partition_valid(cp) && cpumask_empty(cp->exclusive_cpus))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
for (i = 0; i < ndoms; i++) {
for (j = i + 1; j < ndoms; j++) {
if (cpusets_overlap(csa[i], csa[j]))
WARN_ON_ONCE(1);
}
}
generate_doms:
doms = alloc_sched_domains(ndoms);
if (!doms)
goto done;
dattr = kmalloc_objs(struct sched_domain_attr, ndoms);
for (i = 0; i < ndoms; i++) {
if (!csa || csa[i] == &top_cpuset)
cpumask_and(doms[i], top_cpuset.effective_cpus,
housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
else
cpumask_copy(doms[i], csa[i]->effective_cpus);
if (dattr)
dattr[i] = SD_ATTR_INIT;
}
done:
kfree(csa);
if (doms == NULL)
ndoms = 1;
*domains = doms;
*attributes = dattr;
return ndoms;
}
static void dl_update_tasks_root_domain(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
if (cs->nr_deadline_tasks == 0)
return;
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
dl_add_task_root_domain(task);
css_task_iter_end(&it);
}
void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
int cpu;
u64 cookie = ++dl_cookie;
lockdep_assert_cpuset_lock_held();
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
rcu_read_lock();
for_each_possible_cpu(cpu) {
if (dl_bw_visited(cpu, cookie))
continue;
dl_clear_root_domain_cpu(cpu);
}
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (cpumask_empty(cs->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
css_get(&cs->css);
rcu_read_unlock();
dl_update_tasks_root_domain(cs);
rcu_read_lock();
css_put(&cs->css);
}
rcu_read_unlock();
}
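/*
 * Regenerate the sched domains and hand them to the scheduler.  Must be
 * called with cpuset_mutex and the cpus read lock held.  If a generated
 * domain contains CPUs that are no longer active (e.g. a hotplug update is
 * still pending), bail out and let the pending update redo the rebuild.
 */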
void rebuild_sched_domains_locked(void)
{
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
int i;
lockdep_assert_cpus_held();
lockdep_assert_cpuset_lock_held();
force_sd_rebuild = false;
ndoms = generate_sched_domains(&doms, &attr);
for (i = 0; doms && i < ndoms; i++) {
if (WARN_ON_ONCE(!cpumask_subset(doms[i], cpu_active_mask)))
return;
}
partition_sched_domains(ndoms, doms, attr);
}
#else
void rebuild_sched_domains_locked(void)
{
}
#endif
static void rebuild_sched_domains_cpuslocked(void)
{
mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
mutex_unlock(&cpuset_mutex);
}
void rebuild_sched_domains(void)
{
cpus_read_lock();
rebuild_sched_domains_cpuslocked();
cpus_read_unlock();
}
void cpuset_reset_sched_domains(void)
{
mutex_lock(&cpuset_mutex);
partition_sched_domains(1, NULL, NULL);
mutex_unlock(&cpuset_mutex);
}
void cpuset_update_tasks_cpumask(struct cpuset *cs, struct cpumask *new_cpus)
{
struct css_task_iter it;
struct task_struct *task;
bool top_cs = cs == &top_cpuset;
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
const struct cpumask *possible_mask = task_cpu_possible_mask(task);
if (top_cs) {
if (task->flags & (PF_KTHREAD | PF_NO_SETAFFINITY))
continue;
cpumask_andnot(new_cpus, possible_mask, subpartitions_cpus);
} else {
cpumask_and(new_cpus, possible_mask, cs->effective_cpus);
}
set_cpus_allowed_ptr(task, new_cpus);
}
css_task_iter_end(&it);
}
static void compute_effective_cpumask(struct cpumask *new_cpus,
struct cpuset *cs, struct cpuset *parent)
{
cpumask_and(new_cpus, cs->cpus_allowed, parent->effective_cpus);
}
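/*
 * Commands accepted by update_parent_effective_cpumask():
 *   partcmd_enable     - enable a load-balanced partition root
 *   partcmd_enablei    - enable an isolated partition root
 *   partcmd_disable    - turn a partition root back into a member
 *   partcmd_update     - the CPUs of a partition root are being changed
 *   partcmd_invalidate - force the partition into an invalid state
 */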
enum partition_cmd {
partcmd_enable,
partcmd_enablei,
partcmd_disable,
partcmd_update,
partcmd_invalidate,
};
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct tmpmasks *tmp);
static int update_partition_exclusive_flag(struct cpuset *cs, int new_prs)
{
bool exclusive = (new_prs > PRS_MEMBER);
if (exclusive && !is_cpu_exclusive(cs)) {
if (cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 1))
return PERR_NOTEXCL;
} else if (!exclusive && is_cpu_exclusive(cs)) {
cpuset_update_flag(CS_CPU_EXCLUSIVE, cs, 0);
}
return 0;
}
static void update_partition_sd_lb(struct cpuset *cs, int old_prs)
{
int new_prs = cs->partition_root_state;
bool rebuild_domains = (new_prs > 0) || (old_prs > 0);
bool new_lb;
if (new_prs > 0) {
new_lb = (new_prs != PRS_ISOLATED);
} else {
new_lb = is_sched_load_balance(parent_cs(cs));
}
if (new_lb != !!is_sched_load_balance(cs)) {
rebuild_domains = true;
if (new_lb)
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
else
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
}
if (rebuild_domains)
cpuset_force_rebuild();
}
static bool tasks_nocpu_error(struct cpuset *parent, struct cpuset *cs,
struct cpumask *xcpus)
{
return (cpumask_subset(parent->effective_cpus, xcpus) &&
partition_is_populated(parent, cs)) ||
(!cpumask_intersects(xcpus, cpu_active_mask) &&
partition_is_populated(cs, NULL));
}
static void reset_partition_data(struct cpuset *cs)
{
struct cpuset *parent = parent_cs(cs);
if (!cpuset_v2())
return;
lockdep_assert_held(&callback_lock);
if (cpumask_empty(cs->exclusive_cpus)) {
cpumask_clear(cs->effective_xcpus);
if (is_cpu_exclusive(cs))
clear_bit(CS_CPU_EXCLUSIVE, &cs->flags);
}
if (!cpumask_and(cs->effective_cpus, parent->effective_cpus, cs->cpus_allowed))
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
}
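/*
 * Maintain isolated_cpus, the set of CPUs in isolated partitions: CPUs
 * entering an isolated partition are added, CPUs leaving one are removed,
 * and update_housekeeping is set so the housekeeping masks are refreshed
 * once the cpuset locks can be dropped.
 */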
static void isolated_cpus_update(int old_prs, int new_prs, struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs == new_prs);
lockdep_assert_held(&callback_lock);
lockdep_assert_held(&cpuset_mutex);
if (new_prs == PRS_ISOLATED) {
if (cpumask_subset(xcpus, isolated_cpus))
return;
cpumask_or(isolated_cpus, isolated_cpus, xcpus);
} else {
if (!cpumask_intersects(xcpus, isolated_cpus))
return;
cpumask_andnot(isolated_cpus, isolated_cpus, xcpus);
}
update_housekeeping = true;
}
static void partition_xcpus_add(int new_prs, struct cpuset *parent,
struct cpumask *xcpus)
{
WARN_ON_ONCE(new_prs < 0);
lockdep_assert_held(&callback_lock);
if (!parent)
parent = &top_cpuset;
if (parent == &top_cpuset)
cpumask_or(subpartitions_cpus, subpartitions_cpus, xcpus);
if (new_prs != parent->partition_root_state)
isolated_cpus_update(parent->partition_root_state, new_prs,
xcpus);
cpumask_andnot(parent->effective_cpus, parent->effective_cpus, xcpus);
}
static void partition_xcpus_del(int old_prs, struct cpuset *parent,
struct cpumask *xcpus)
{
WARN_ON_ONCE(old_prs < 0);
lockdep_assert_held(&callback_lock);
if (!parent)
parent = &top_cpuset;
if (parent == &top_cpuset)
cpumask_andnot(subpartitions_cpus, subpartitions_cpus, xcpus);
if (old_prs != parent->partition_root_state)
isolated_cpus_update(old_prs, parent->partition_root_state,
xcpus);
cpumask_or(parent->effective_cpus, parent->effective_cpus, xcpus);
cpumask_and(parent->effective_cpus, parent->effective_cpus, cpu_active_mask);
}
static bool isolated_cpus_can_update(struct cpumask *add_cpus,
struct cpumask *del_cpus)
{
cpumask_var_t full_hk_cpus;
	bool res = true;
if (!housekeeping_enabled(HK_TYPE_KERNEL_NOISE))
return true;
if (del_cpus && cpumask_weight_and(del_cpus,
housekeeping_cpumask(HK_TYPE_KERNEL_NOISE)))
return true;
if (!alloc_cpumask_var(&full_hk_cpus, GFP_KERNEL))
return false;
cpumask_and(full_hk_cpus, housekeeping_cpumask(HK_TYPE_KERNEL_NOISE),
housekeeping_cpumask(HK_TYPE_DOMAIN));
cpumask_andnot(full_hk_cpus, full_hk_cpus, isolated_cpus);
cpumask_and(full_hk_cpus, full_hk_cpus, cpu_active_mask);
if (!cpumask_weight_andnot(full_hk_cpus, add_cpus))
res = false;
free_cpumask_var(full_hk_cpus);
return res;
}
static bool prstate_housekeeping_conflict(int prstate, struct cpumask *new_cpus)
{
if (!housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
return false;
if ((prstate != PRS_ISOLATED) &&
!cpumask_subset(new_cpus, housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT)))
return true;
return false;
}
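/*
 * Unlock counterpart of cpuset_full_lock() for write paths: rebuild the
 * sched domains first if requested, then drop cpuset_mutex and the cpus read
 * lock before applying any pending isolated-CPU change via
 * housekeeping_update(), keeping cpuset_top_mutex held across that call.
 */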
static void cpuset_update_sd_hk_unlock(void)
__releases(&cpuset_mutex)
__releases(&cpuset_top_mutex)
{
if (force_sd_rebuild)
rebuild_sched_domains_locked();
if (update_housekeeping) {
update_housekeeping = false;
cpumask_copy(isolated_hk_cpus, isolated_cpus);
mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
WARN_ON_ONCE(housekeeping_update(isolated_hk_cpus));
mutex_unlock(&cpuset_top_mutex);
} else {
cpuset_full_unlock();
}
}
static void hk_sd_workfn(struct work_struct *work)
{
cpuset_full_lock();
cpuset_update_sd_hk_unlock();
}
static int rm_siblings_excl_cpus(struct cpuset *parent, struct cpuset *cs,
struct cpumask *excpus)
{
struct cgroup_subsys_state *css;
struct cpuset *sibling;
int retval = 0;
if (cpumask_empty(excpus))
return 0;
rcu_read_lock();
cpuset_for_each_child(sibling, css, parent) {
struct cpumask *sibling_xcpus;
if (sibling == cs)
continue;
sibling_xcpus = cpumask_empty(sibling->exclusive_cpus)
? sibling->effective_xcpus
: sibling->exclusive_cpus;
if (cpumask_intersects(excpus, sibling_xcpus)) {
cpumask_andnot(excpus, excpus, sibling_xcpus);
retval++;
}
}
rcu_read_unlock();
return retval;
}
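/*
 * compute_excpus - compute the effective exclusive CPUs of @cs
 *
 * Restrict the user-requested exclusive CPUs (falling back to cpus_allowed
 * when cpuset.cpus.exclusive is empty) to the parent's effective exclusive
 * CPUs; in the fallback case, CPUs claimed by siblings are also removed.
 * Returns the number of siblings whose CPUs had to be excluded.
 */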
static int compute_excpus(struct cpuset *cs, struct cpumask *excpus)
{
struct cpuset *parent = parent_cs(cs);
cpumask_and(excpus, user_xcpus(cs), parent->effective_xcpus);
if (!cpumask_empty(cs->exclusive_cpus))
return 0;
return rm_siblings_excl_cpus(parent, cs, excpus);
}
static int compute_trialcs_excpus(struct cpuset *trialcs, struct cpuset *cs)
{
struct cpuset *parent = parent_cs(trialcs);
struct cpumask *excpus = trialcs->effective_xcpus;
if (cs_is_member(cs))
cpumask_and(excpus, trialcs->exclusive_cpus,
parent->effective_xcpus);
else
cpumask_and(excpus, user_xcpus(trialcs), parent->effective_xcpus);
return rm_siblings_excl_cpus(parent, cs, excpus);
}
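/*
 * A remote partition is a valid partition root whose parent is not itself a
 * valid partition root.  Its exclusive CPUs are carved directly out of the
 * top cpuset's effective CPUs and tracked in subpartitions_cpus.
 */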
static inline bool is_remote_partition(struct cpuset *cs)
{
return cs->remote_partition;
}
static inline bool is_local_partition(struct cpuset *cs)
{
return is_partition_valid(cs) && !is_remote_partition(cs);
}
static int remote_partition_enable(struct cpuset *cs, int new_prs,
struct tmpmasks *tmp)
{
if (!capable(CAP_SYS_ADMIN))
return PERR_ACCESS;
compute_excpus(cs, tmp->new_cpus);
WARN_ON_ONCE(cpumask_intersects(tmp->new_cpus, subpartitions_cpus));
if (!cpumask_intersects(tmp->new_cpus, cpu_active_mask) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->new_cpus))
return PERR_INVCPUS;
if (((new_prs == PRS_ISOLATED) &&
!isolated_cpus_can_update(tmp->new_cpus, NULL)) ||
prstate_housekeeping_conflict(new_prs, tmp->new_cpus))
return PERR_HKEEPING;
spin_lock_irq(&callback_lock);
partition_xcpus_add(new_prs, NULL, tmp->new_cpus);
cs->remote_partition = true;
cpumask_copy(cs->effective_xcpus, tmp->new_cpus);
spin_unlock_irq(&callback_lock);
cpuset_force_rebuild();
cs->prs_err = 0;
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
return 0;
}
static void remote_partition_disable(struct cpuset *cs, struct tmpmasks *tmp)
{
WARN_ON_ONCE(!is_remote_partition(cs));
WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus) &&
!cpumask_empty(subpartitions_cpus));
spin_lock_irq(&callback_lock);
cs->remote_partition = false;
partition_xcpus_del(cs->partition_root_state, NULL, cs->effective_xcpus);
if (cs->prs_err)
cs->partition_root_state = -cs->partition_root_state;
else
cs->partition_root_state = PRS_MEMBER;
compute_excpus(cs, cs->effective_xcpus);
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
cpuset_force_rebuild();
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
}
static void remote_cpus_update(struct cpuset *cs, struct cpumask *xcpus,
struct cpumask *excpus, struct tmpmasks *tmp)
{
bool adding, deleting;
int prs = cs->partition_root_state;
if (WARN_ON_ONCE(!is_remote_partition(cs)))
return;
WARN_ON_ONCE(!cpumask_subset(cs->effective_xcpus, subpartitions_cpus));
if (cpumask_empty(excpus)) {
cs->prs_err = PERR_CPUSEMPTY;
goto invalidate;
}
adding = cpumask_andnot(tmp->addmask, excpus, cs->effective_xcpus);
deleting = cpumask_andnot(tmp->delmask, cs->effective_xcpus, excpus);
if (adding) {
WARN_ON_ONCE(cpumask_intersects(tmp->addmask, subpartitions_cpus));
if (!capable(CAP_SYS_ADMIN))
cs->prs_err = PERR_ACCESS;
else if (cpumask_intersects(tmp->addmask, subpartitions_cpus) ||
cpumask_subset(top_cpuset.effective_cpus, tmp->addmask))
cs->prs_err = PERR_NOCPUS;
else if ((prs == PRS_ISOLATED) &&
!isolated_cpus_can_update(tmp->addmask, tmp->delmask))
cs->prs_err = PERR_HKEEPING;
if (cs->prs_err)
goto invalidate;
}
spin_lock_irq(&callback_lock);
if (adding)
partition_xcpus_add(prs, NULL, tmp->addmask);
if (deleting)
partition_xcpus_del(prs, NULL, tmp->delmask);
cpumask_copy(cs->effective_xcpus, excpus);
if (xcpus)
cpumask_copy(cs->exclusive_cpus, xcpus);
spin_unlock_irq(&callback_lock);
if (adding || deleting)
cpuset_force_rebuild();
cpuset_update_tasks_cpumask(&top_cpuset, tmp->new_cpus);
update_sibling_cpumasks(&top_cpuset, NULL, tmp);
return;
invalidate:
remote_partition_disable(cs, tmp);
}
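/*
 * update_parent_effective_cpumask - handle a local partition operation
 * @cmd:     one of the partcmd_* commands
 * @newmask: new exclusive CPUs for partcmd_update, NULL otherwise
 * @tmp:     scratch cpumasks
 *
 * Work out which CPUs must be returned to (tmp->addmask) or taken from
 * (tmp->delmask) the parent's effective_cpus, flip the partition root state
 * of @cs when an update cannot be satisfied (recording the reason in
 * prs_err), and update the cpumasks of affected tasks and siblings.
 * Returns 0, or a PERR_* code when the request cannot proceed at all (for
 * example when the parent is not a valid partition root).
 */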
static int update_parent_effective_cpumask(struct cpuset *cs, int cmd,
struct cpumask *newmask,
struct tmpmasks *tmp)
{
struct cpuset *parent = parent_cs(cs);
int adding;
int deleting;
int old_prs, new_prs;
int part_error = PERR_NONE;
struct cpumask *xcpus = user_xcpus(cs);
int parent_prs = parent->partition_root_state;
bool nocpu;
lockdep_assert_cpuset_lock_held();
WARN_ON_ONCE(is_remote_partition(cs));
adding = deleting = false;
old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_invalidate) {
if (is_partition_invalid(cs))
return 0;
if (is_partition_valid(parent))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
if (old_prs > 0)
new_prs = -old_prs;
goto write_error;
}
if (!is_partition_valid(parent)) {
return is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART;
}
if (!newmask && xcpus_empty(cs))
return PERR_CPUSEMPTY;
nocpu = tasks_nocpu_error(parent, cs, xcpus);
if ((cmd == partcmd_enable) || (cmd == partcmd_enablei)) {
xcpus = tmp->delmask;
if (compute_excpus(cs, xcpus))
WARN_ON_ONCE(!cpumask_empty(cs->exclusive_cpus));
new_prs = (cmd == partcmd_enable) ? PRS_ROOT : PRS_ISOLATED;
if (cpumask_empty(xcpus))
return PERR_INVCPUS;
if (prstate_housekeeping_conflict(new_prs, xcpus))
return PERR_HKEEPING;
if ((new_prs == PRS_ISOLATED) && (new_prs != parent_prs) &&
!isolated_cpus_can_update(xcpus, NULL))
return PERR_HKEEPING;
if (tasks_nocpu_error(parent, cs, xcpus))
return PERR_NOCPUS;
cpumask_and(tmp->new_cpus, xcpus, cpu_active_mask);
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
deleting = true;
} else if (cmd == partcmd_disable) {
if (is_partition_valid(cs)) {
cpumask_copy(tmp->addmask, cs->effective_xcpus);
adding = true;
}
new_prs = PRS_MEMBER;
} else if (newmask) {
if (cpumask_empty(newmask)) {
part_error = PERR_CPUSEMPTY;
goto write_error;
}
nocpu |= tasks_nocpu_error(parent, cs, newmask);
if (is_partition_invalid(cs)) {
adding = false;
deleting = cpumask_and(tmp->delmask,
newmask, parent->effective_xcpus);
} else {
cpumask_andnot(tmp->addmask, xcpus, newmask);
adding = cpumask_and(tmp->addmask, tmp->addmask,
parent->effective_xcpus);
cpumask_andnot(tmp->delmask, newmask, xcpus);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->effective_xcpus);
}
if (is_partition_valid(cs) && (old_prs != parent_prs)) {
if ((parent_prs == PRS_ROOT) &&
!isolated_cpus_can_update(tmp->delmask, tmp->addmask))
part_error = PERR_HKEEPING;
if ((parent_prs == PRS_ISOLATED) &&
!isolated_cpus_can_update(tmp->addmask, tmp->delmask))
part_error = PERR_HKEEPING;
}
if (deleting) {
cpumask_and(tmp->new_cpus, tmp->delmask, cpu_active_mask);
WARN_ON_ONCE(!cpumask_subset(tmp->new_cpus, parent->effective_cpus));
}
if (nocpu && (!adding ||
!cpumask_intersects(tmp->addmask, cpu_active_mask))) {
part_error = PERR_NOCPUS;
deleting = false;
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
}
} else {
if (nocpu) {
part_error = PERR_NOCPUS;
if (is_partition_valid(cs))
adding = cpumask_and(tmp->addmask,
xcpus, parent->effective_xcpus);
} else if (is_partition_invalid(cs) && !cpumask_empty(xcpus) &&
cpumask_subset(xcpus, parent->effective_xcpus)) {
struct cgroup_subsys_state *css;
struct cpuset *child;
bool exclusive = true;
rcu_read_lock();
cpuset_for_each_child(child, css, parent) {
if (child == cs)
continue;
if (!cpusets_are_exclusive(cs, child)) {
exclusive = false;
break;
}
}
rcu_read_unlock();
if (exclusive)
deleting = cpumask_and(tmp->delmask,
xcpus, parent->effective_cpus);
else
part_error = PERR_NOTEXCL;
}
}
write_error:
if (part_error)
WRITE_ONCE(cs->prs_err, part_error);
if (cmd == partcmd_update) {
switch (cs->partition_root_state) {
case PRS_ROOT:
case PRS_ISOLATED:
if (part_error)
new_prs = -old_prs;
break;
case PRS_INVALID_ROOT:
case PRS_INVALID_ISOLATED:
if (!part_error)
new_prs = -old_prs;
break;
}
}
if (!adding && !deleting && (new_prs == old_prs))
return 0;
if ((old_prs != new_prs) && (cmd != partcmd_update)) {
int err = update_partition_exclusive_flag(cs, new_prs);
if (err)
return err;
}
spin_lock_irq(&callback_lock);
if (old_prs != new_prs)
cs->partition_root_state = new_prs;
if (adding)
partition_xcpus_del(old_prs, parent, tmp->addmask);
if (deleting)
partition_xcpus_add(new_prs, parent, tmp->delmask);
spin_unlock_irq(&callback_lock);
if ((old_prs != new_prs) && (cmd == partcmd_update))
update_partition_exclusive_flag(cs, new_prs);
if (adding || deleting) {
cpuset_update_tasks_cpumask(parent, tmp->addmask);
update_sibling_cpumasks(parent, cs, tmp);
}
if ((cmd == partcmd_update) && !newmask)
update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs);
return 0;
}
static void compute_partition_effective_cpumask(struct cpuset *cs,
struct cpumask *new_ecpus)
{
struct cgroup_subsys_state *css;
struct cpuset *child;
bool populated = partition_is_populated(cs, NULL);
compute_excpus(cs, new_ecpus);
cpumask_and(new_ecpus, new_ecpus, cpu_active_mask);
rcu_read_lock();
cpuset_for_each_child(child, css, cs) {
if (!is_partition_valid(child))
continue;
WARN_ON_ONCE(is_remote_partition(child));
child->prs_err = 0;
if (!cpumask_subset(child->effective_xcpus,
cs->effective_xcpus))
child->prs_err = PERR_INVCPUS;
else if (populated &&
cpumask_subset(new_ecpus, child->effective_xcpus))
child->prs_err = PERR_NOCPUS;
if (child->prs_err) {
int old_prs = child->partition_root_state;
spin_lock_irq(&callback_lock);
make_partition_invalid(child);
spin_unlock_irq(&callback_lock);
notify_partition_change(child, old_prs);
continue;
}
cpumask_andnot(new_ecpus, new_ecpus,
child->effective_xcpus);
}
rcu_read_unlock();
}
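/*
 * update_cpumasks_hier - update effective_cpus throughout the subtree of @cs
 * @force: update children even when their effective_cpus look unchanged
 *
 * For each descendant, recompute effective_cpus from its parent (taking
 * local and remote partitions into account), invalidate partitions that can
 * no longer get CPUs, update the affected tasks' cpumasks and request a
 * sched-domain rebuild where load balancing is affected.  The RCU read lock
 * is dropped around the blocking parts while a css reference is held.
 */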
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
int old_prs, new_prs;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
bool remote = is_remote_partition(cp);
bool update_parent = false;
old_prs = new_prs = cp->partition_root_state;
if (remote && (cp != cs)) {
compute_excpus(cp, tmp->new_cpus);
if (cpumask_equal(cp->effective_xcpus, tmp->new_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
rcu_read_unlock();
remote_cpus_update(cp, NULL, tmp->new_cpus, tmp);
rcu_read_lock();
new_prs = cp->partition_root_state;
remote = (new_prs == old_prs);
}
if (remote || (is_partition_valid(parent) && is_partition_valid(cp)))
compute_partition_effective_cpumask(cp, tmp->new_cpus);
else
compute_effective_cpumask(tmp->new_cpus, cp, parent);
if (remote)
goto get_css;
if (is_partition_valid(cp) && cpumask_empty(tmp->new_cpus)) {
update_parent = true;
goto update_parent_effective;
}
if (is_in_v2_mode() && !remote && cpumask_empty(tmp->new_cpus))
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus) &&
(!cpuset_v2() ||
(is_sched_load_balance(parent) == is_sched_load_balance(cp)))) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
update_parent_effective:
if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
case PRS_ROOT:
case PRS_ISOLATED:
update_parent = true;
break;
default:
if (is_partition_valid(cp))
new_prs = -cp->partition_root_state;
WRITE_ONCE(cp->prs_err,
is_partition_invalid(parent)
? PERR_INVPARENT : PERR_NOTPART);
break;
}
}
get_css:
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
if (update_parent) {
update_parent_effective_cpumask(cp, partcmd_update, NULL, tmp);
new_prs = cp->partition_root_state;
}
spin_lock_irq(&callback_lock);
cpumask_copy(cp->effective_cpus, tmp->new_cpus);
cp->partition_root_state = new_prs;
if ((new_prs > 0) || !cpumask_empty(cp->exclusive_cpus))
compute_excpus(cp, cp->effective_xcpus);
if (new_prs <= 0)
reset_partition_data(cp);
spin_unlock_irq(&callback_lock);
notify_partition_change(cp, old_prs);
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
cpuset_update_tasks_cpumask(cp, tmp->new_cpus);
if (cpuset_v2() && !is_partition_valid(cp) &&
(is_sched_load_balance(parent) != is_sched_load_balance(cp))) {
if (is_sched_load_balance(parent))
set_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
else
clear_bit(CS_SCHED_LOAD_BALANCE, &cp->flags);
}
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cpuset_v2() || is_partition_valid(cp)))
cpuset_force_rebuild();
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
}
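/*
 * Propagate a change in @parent's effective_cpus to the siblings of @cs that
 * are not partition roots themselves, since their effective CPUs are
 * inherited from the parent.
 */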
static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct tmpmasks *tmp)
{
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
lockdep_assert_cpuset_lock_held();
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
if (sibling == cs || is_partition_valid(sibling))
continue;
compute_effective_cpumask(tmp->new_cpus, sibling,
parent);
if (cpumask_equal(tmp->new_cpus, sibling->effective_cpus))
continue;
if (!css_tryget_online(&sibling->css))
continue;
rcu_read_unlock();
update_cpumasks_hier(sibling, tmp, false);
rcu_read_lock();
css_put(&sibling->css);
}
rcu_read_unlock();
}
static int parse_cpuset_cpulist(const char *buf, struct cpumask *out_mask)
{
int retval;
retval = cpulist_parse(buf, out_mask);
if (retval < 0)
return retval;
if (!cpumask_subset(out_mask, top_cpuset.cpus_allowed))
return -EINVAL;
return 0;
}
static enum prs_errcode validate_partition(struct cpuset *cs, struct cpuset *trialcs)
{
struct cpuset *parent = parent_cs(cs);
if (cs_is_member(trialcs))
return PERR_NONE;
if (cpumask_empty(trialcs->effective_xcpus))
return PERR_INVCPUS;
if (prstate_housekeeping_conflict(trialcs->partition_root_state,
trialcs->effective_xcpus))
return PERR_HKEEPING;
if (tasks_nocpu_error(parent, cs, trialcs->effective_xcpus))
return PERR_NOCPUS;
return PERR_NONE;
}
static void partition_cpus_change(struct cpuset *cs, struct cpuset *trialcs,
struct tmpmasks *tmp)
{
enum prs_errcode prs_err;
if (cs_is_member(cs))
return;
prs_err = validate_partition(cs, trialcs);
if (prs_err)
trialcs->prs_err = cs->prs_err = prs_err;
if (is_remote_partition(cs)) {
if (trialcs->prs_err)
remote_partition_disable(cs, tmp);
else
remote_cpus_update(cs, trialcs->exclusive_cpus,
trialcs->effective_xcpus, tmp);
} else {
if (trialcs->prs_err)
update_parent_effective_cpumask(cs, partcmd_invalidate,
NULL, tmp);
else
update_parent_effective_cpumask(cs, partcmd_update,
trialcs->effective_xcpus, tmp);
}
}
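/*
 * update_cpumask - handle a write to cpuset.cpus
 * @trialcs: scratch copy of @cs carrying the proposed masks
 * @buf:     the user-supplied CPU list
 *
 * Parse and validate the new mask against the parent and siblings, update
 * any partition rooted at @cs, then commit the new masks and propagate the
 * change down the hierarchy.
 */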
static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
struct tmpmasks tmp;
bool force = false;
int old_prs = cs->partition_root_state;
retval = parse_cpuset_cpulist(buf, trialcs->cpus_allowed);
if (retval < 0)
return retval;
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
compute_trialcs_excpus(trialcs, cs);
trialcs->prs_err = PERR_NONE;
retval = validate_change(cs, trialcs);
if (retval < 0)
return retval;
if (alloc_tmpmasks(&tmp))
return -ENOMEM;
force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
partition_cpus_change(cs, trialcs, &tmp);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
if ((old_prs > 0) && !is_partition_valid(cs))
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
update_cpumasks_hier(cs, &tmp, force);
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
free_tmpmasks(&tmp);
return retval;
}
static int update_exclusive_cpumask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
struct tmpmasks tmp;
bool force = false;
int old_prs = cs->partition_root_state;
retval = parse_cpuset_cpulist(buf, trialcs->exclusive_cpus);
if (retval < 0)
return retval;
if (cpumask_equal(cs->exclusive_cpus, trialcs->exclusive_cpus))
return 0;
if (compute_trialcs_excpus(trialcs, cs))
return -EINVAL;
force = !cpumask_equal(cs->effective_xcpus, trialcs->effective_xcpus);
retval = validate_change(cs, trialcs);
if (retval)
return retval;
if (alloc_tmpmasks(&tmp))
return -ENOMEM;
trialcs->prs_err = PERR_NONE;
partition_cpus_change(cs, trialcs, &tmp);
spin_lock_irq(&callback_lock);
cpumask_copy(cs->exclusive_cpus, trialcs->exclusive_cpus);
cpumask_copy(cs->effective_xcpus, trialcs->effective_xcpus);
if ((old_prs > 0) && !is_partition_valid(cs))
reset_partition_data(cs);
spin_unlock_irq(&callback_lock);
if (is_partition_valid(cs) || force)
update_cpumasks_hier(cs, &tmp, force);
if (cs->partition_root_state)
update_partition_sd_lb(cs, old_prs);
free_tmpmasks(&tmp);
return 0;
}
struct cpuset_migrate_mm_work {
struct work_struct work;
struct mm_struct *mm;
nodemask_t from;
nodemask_t to;
};
static void cpuset_migrate_mm_workfn(struct work_struct *work)
{
struct cpuset_migrate_mm_work *mwork =
container_of(work, struct cpuset_migrate_mm_work, work);
do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
mmput(mwork->mm);
kfree(mwork);
}
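/*
 * Migrate the pages of @mm from @from to @to asynchronously on
 * cpuset_migrate_mm_wq, so the caller does not block on page migration
 * while holding cpuset locks.  The mm reference is dropped once the work
 * item finishes (or immediately if there is nothing to migrate or no memory
 * for the work item).
 */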
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
struct cpuset_migrate_mm_work *mwork;
if (nodes_equal(*from, *to)) {
mmput(mm);
return;
}
mwork = kzalloc_obj(*mwork);
if (mwork) {
mwork->mm = mm;
mwork->from = *from;
mwork->to = *to;
INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
queue_work(cpuset_migrate_mm_wq, &mwork->work);
} else {
mmput(mm);
}
}
static void flush_migrate_mm_task_workfn(struct callback_head *head)
{
flush_workqueue(cpuset_migrate_mm_wq);
kfree(head);
}
static void schedule_flush_migrate_mm(void)
{
struct callback_head *flush_cb;
flush_cb = kzalloc_obj(struct callback_head);
if (!flush_cb)
return;
init_task_work(flush_cb, flush_migrate_mm_task_workfn);
if (task_work_add(current, flush_cb, TWA_RESUME))
kfree(flush_cb);
}
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
task_lock(tsk);
local_irq_disable();
write_seqcount_begin(&tsk->mems_allowed_seq);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
write_seqcount_end(&tsk->mems_allowed_seq);
local_irq_enable();
task_unlock(tsk);
}
static void *cpuset_being_rebound;
void cpuset_update_tasks_nodemask(struct cpuset *cs)
{
static nodemask_t newmems;
struct css_task_iter it;
struct task_struct *task;
cpuset_being_rebound = cs;
guarantee_online_mems(cs, &newmems);
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
bool migrate;
cpuset_change_task_nodemask(task, &newmems);
mm = get_task_mm(task);
if (!mm)
continue;
migrate = is_memory_migrate(cs);
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
else
mmput(mm);
}
css_task_iter_end(&it);
cs->old_mems_allowed = newmems;
cpuset_being_rebound = NULL;
}
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
bool has_mems = nodes_and(*new_mems, cp->mems_allowed, parent->effective_mems);
if (is_in_v2_mode() && !has_mems)
*new_mems = parent->effective_mems;
if (nodes_equal(*new_mems, cp->effective_mems)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
if (!css_tryget_online(&cp->css))
continue;
rcu_read_unlock();
spin_lock_irq(&callback_lock);
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
cpuset_update_tasks_nodemask(cp);
rcu_read_lock();
css_put(&cp->css);
}
rcu_read_unlock();
}
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
const char *buf)
{
int retval;
retval = nodelist_parse(buf, trialcs->mems_allowed);
if (retval < 0)
return retval;
if (!nodes_subset(trialcs->mems_allowed,
top_cpuset.mems_allowed))
return -EINVAL;
if (nodes_equal(cs->mems_allowed, trialcs->mems_allowed))
return 0;
retval = validate_change(cs, trialcs);
if (retval < 0)
return retval;
check_insane_mems_config(&trialcs->mems_allowed);
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
update_nodemasks_hier(cs, &trialcs->mems_allowed);
return 0;
}
bool current_cpuset_is_being_rebound(void)
{
bool ret;
rcu_read_lock();
ret = task_cs(current) == cpuset_being_rebound;
rcu_read_unlock();
return ret;
}
int cpuset_update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
int turning_on)
{
struct cpuset *trialcs;
int balance_flag_changed;
int spread_flag_changed;
int err;
trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs)
return -ENOMEM;
if (turning_on)
set_bit(bit, &trialcs->flags);
else
clear_bit(bit, &trialcs->flags);
err = validate_change(cs, trialcs);
if (err < 0)
goto out;
balance_flag_changed = (is_sched_load_balance(cs) !=
is_sched_load_balance(trialcs));
spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
|| (is_spread_page(cs) != is_spread_page(trialcs)));
spin_lock_irq(&callback_lock);
cs->flags = trialcs->flags;
spin_unlock_irq(&callback_lock);
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) {
if (cpuset_v2())
cpuset_force_rebuild();
else
rebuild_sched_domains_locked();
}
if (spread_flag_changed)
cpuset1_update_tasks_flags(cs);
out:
free_cpuset(trialcs);
return err;
}
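/*
 * update_prstate - handle a write to cpuset.cpus.partition
 * @new_prs: PRS_MEMBER, PRS_ROOT or PRS_ISOLATED
 *
 * Enable, disable or switch the type of the partition rooted at @cs via
 * either the local path (parent is a valid partition root) or the remote
 * path.  Apart from -ENOMEM, the function returns 0 even when the request
 * fails; the partition is then marked invalid and the reason is reported
 * through cpuset.cpus.partition via prs_err.
 */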
static int update_prstate(struct cpuset *cs, int new_prs)
{
int err = PERR_NONE, old_prs = cs->partition_root_state;
struct cpuset *parent = parent_cs(cs);
struct tmpmasks tmpmask;
bool isolcpus_updated = false;
if (old_prs == new_prs)
return 0;
if (new_prs && is_partition_invalid(cs))
old_prs = PRS_MEMBER;
if (alloc_tmpmasks(&tmpmask))
return -ENOMEM;
err = update_partition_exclusive_flag(cs, new_prs);
if (err)
goto out;
if (!old_prs) {
if (xcpus_empty(cs)) {
err = PERR_CPUSEMPTY;
goto out;
}
if ((parent == &top_cpuset) &&
cpumask_intersects(cs->exclusive_cpus, subpartitions_cpus)) {
err = PERR_REMOTE;
goto out;
}
if (is_partition_valid(parent)) {
enum partition_cmd cmd = (new_prs == PRS_ROOT)
? partcmd_enable : partcmd_enablei;
err = update_parent_effective_cpumask(cs, cmd, NULL, &tmpmask);
} else {
err = remote_partition_enable(cs, new_prs, &tmpmask);
}
} else if (old_prs && new_prs) {
if (((new_prs == PRS_ISOLATED) &&
!isolated_cpus_can_update(cs->effective_xcpus, NULL)) ||
prstate_housekeeping_conflict(new_prs, cs->effective_xcpus))
err = PERR_HKEEPING;
else
isolcpus_updated = true;
} else {
if (is_remote_partition(cs))
remote_partition_disable(cs, &tmpmask);
else
update_parent_effective_cpumask(cs, partcmd_disable,
NULL, &tmpmask);
}
out:
if (err) {
new_prs = -new_prs;
update_partition_exclusive_flag(cs, new_prs);
}
spin_lock_irq(&callback_lock);
cs->partition_root_state = new_prs;
WRITE_ONCE(cs->prs_err, err);
if (!is_partition_valid(cs))
reset_partition_data(cs);
else if (isolcpus_updated)
isolated_cpus_update(old_prs, new_prs, cs->effective_xcpus);
spin_unlock_irq(&callback_lock);
update_cpumasks_hier(cs, &tmpmask, !new_prs);
WARN_ON_ONCE(!old_prs && (new_prs > 0)
&& cpumask_empty(cs->effective_xcpus));
update_partition_sd_lb(cs, old_prs);
notify_partition_change(cs, old_prs);
if (force_sd_rebuild)
rebuild_sched_domains_locked();
free_tmpmasks(&tmpmask);
return 0;
}
static struct cpuset *cpuset_attach_old_cs;
static int cpuset_can_attach_check(struct cpuset *cs)
{
if (cpumask_empty(cs->effective_cpus) ||
(!is_in_v2_mode() && nodes_empty(cs->mems_allowed)))
return -ENOSPC;
return 0;
}
static void reset_migrate_dl_data(struct cpuset *cs)
{
cs->nr_migrate_dl_tasks = 0;
cs->sum_migrate_dl_bw = 0;
}
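/*
 * Check that every task in @tset may be moved into the destination cpuset:
 * the cpuset must have effective CPUs and memory, per-task scheduler and
 * security checks must pass, and deadline tasks must be able to reserve
 * bandwidth on the destination CPUs.  On success, attach_in_progress is
 * bumped and stays elevated until cpuset_attach() or cpuset_cancel_attach()
 * completes the operation.
 */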
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct cpuset *cs, *oldcs;
struct task_struct *task;
bool setsched_check;
int ret;
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
oldcs = cpuset_attach_old_cs;
cs = css_cs(css);
mutex_lock(&cpuset_mutex);
ret = cpuset_can_attach_check(cs);
if (ret)
goto out_unlock;
setsched_check = !cpuset_v2() ||
!cpumask_equal(cs->effective_cpus, oldcs->effective_cpus) ||
!nodes_equal(cs->effective_mems, oldcs->effective_mems);
if (!is_in_v2_mode() && cpumask_empty(oldcs->effective_cpus))
setsched_check = false;
cgroup_taskset_for_each(task, css, tset) {
ret = task_can_attach(task);
if (ret)
goto out_unlock;
if (setsched_check) {
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
}
if (dl_task(task)) {
cs->nr_migrate_dl_tasks++;
cs->sum_migrate_dl_bw += task->dl.dl_bw;
}
}
if (!cs->nr_migrate_dl_tasks)
goto out_success;
if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
if (unlikely(cpu >= nr_cpu_ids)) {
reset_migrate_dl_data(cs);
ret = -EINVAL;
goto out_unlock;
}
ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
if (ret) {
reset_migrate_dl_data(cs);
goto out_unlock;
}
}
out_success:
cs->attach_in_progress++;
out_unlock:
mutex_unlock(&cpuset_mutex);
return ret;
}
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
struct cpuset *cs;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
mutex_lock(&cpuset_mutex);
dec_attach_in_progress_locked(cs);
if (cs->nr_migrate_dl_tasks) {
int cpu = cpumask_any(cs->effective_cpus);
dl_bw_free(cpu, cs->sum_migrate_dl_bw);
reset_migrate_dl_data(cs);
}
mutex_unlock(&cpuset_mutex);
}
static cpumask_var_t cpus_attach;
static nodemask_t cpuset_attach_nodemask_to;
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
lockdep_assert_cpuset_lock_held();
if (cs != &top_cpuset)
guarantee_active_cpus(task, cpus_attach);
else
cpumask_andnot(cpus_attach, task_cpu_possible_mask(task),
subpartitions_cpus);
WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpus_attach));
cpuset_change_task_nodemask(task, &cpuset_attach_nodemask_to);
cpuset1_update_task_spread_flags(cs, task);
}
static void cpuset_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct task_struct *leader;
struct cgroup_subsys_state *css;
struct cpuset *cs;
struct cpuset *oldcs = cpuset_attach_old_cs;
bool cpus_updated, mems_updated;
bool queue_task_work = false;
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
lockdep_assert_cpus_held();
mutex_lock(&cpuset_mutex);
cpus_updated = !cpumask_equal(cs->effective_cpus,
oldcs->effective_cpus);
mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
if (cpuset_v2() && !cpus_updated && !mems_updated) {
cpuset_attach_nodemask_to = cs->effective_mems;
goto out;
}
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset)
cpuset_attach_task(cs, task);
cpuset_attach_nodemask_to = cs->effective_mems;
if (!is_memory_migrate(cs) && !mems_updated)
goto out;
cgroup_taskset_for_each_leader(leader, css, tset) {
struct mm_struct *mm = get_task_mm(leader);
if (mm) {
mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs)) {
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
queue_task_work = true;
} else
mmput(mm);
}
}
out:
if (queue_task_work)
schedule_flush_migrate_mm();
cs->old_mems_allowed = cpuset_attach_nodemask_to;
if (cs->nr_migrate_dl_tasks) {
cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
reset_migrate_dl_data(cs);
}
dec_attach_in_progress_locked(cs);
mutex_unlock(&cpuset_mutex);
}
ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
struct cpuset *trialcs;
int retval = -ENODEV;
if (cs == &top_cpuset)
return -EACCES;
buf = strstrip(buf);
cpuset_full_lock();
if (!is_cpuset_online(cs))
goto out_unlock;
trialcs = dup_or_alloc_cpuset(cs);
if (!trialcs) {
retval = -ENOMEM;
goto out_unlock;
}
switch (of_cft(of)->private) {
case FILE_CPULIST:
retval = update_cpumask(cs, trialcs, buf);
break;
case FILE_EXCLUSIVE_CPULIST:
retval = update_exclusive_cpumask(cs, trialcs, buf);
break;
case FILE_MEMLIST:
retval = update_nodemask(cs, trialcs, buf);
break;
default:
retval = -EINVAL;
break;
}
free_cpuset(trialcs);
out_unlock:
cpuset_update_sd_hk_unlock();
if (of_cft(of)->private == FILE_MEMLIST)
schedule_flush_migrate_mm();
return retval ?: nbytes;
}
int cpuset_common_seq_show(struct seq_file *sf, void *v)
{
struct cpuset *cs = css_cs(seq_css(sf));
cpuset_filetype_t type = seq_cft(sf)->private;
int ret = 0;
spin_lock_irq(&callback_lock);
switch (type) {
case FILE_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
break;
case FILE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
break;
case FILE_EFFECTIVE_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_cpus));
break;
case FILE_EFFECTIVE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->effective_mems));
break;
case FILE_EXCLUSIVE_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->exclusive_cpus));
break;
case FILE_EFFECTIVE_XCPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->effective_xcpus));
break;
case FILE_SUBPARTS_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(subpartitions_cpus));
break;
case FILE_ISOLATED_CPULIST:
seq_printf(sf, "%*pbl\n", cpumask_pr_args(isolated_cpus));
break;
default:
ret = -EINVAL;
}
spin_unlock_irq(&callback_lock);
return ret;
}
static int cpuset_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
const char *err, *type = NULL;
switch (cs->partition_root_state) {
case PRS_ROOT:
seq_puts(seq, "root\n");
break;
case PRS_ISOLATED:
seq_puts(seq, "isolated\n");
break;
case PRS_MEMBER:
seq_puts(seq, "member\n");
break;
case PRS_INVALID_ROOT:
type = "root";
fallthrough;
case PRS_INVALID_ISOLATED:
if (!type)
type = "isolated";
err = perr_strings[READ_ONCE(cs->prs_err)];
if (err)
seq_printf(seq, "%s invalid (%s)\n", type, err);
else
seq_printf(seq, "%s invalid\n", type);
break;
}
return 0;
}
static ssize_t cpuset_partition_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct cpuset *cs = css_cs(of_css(of));
int val;
int retval = -ENODEV;
buf = strstrip(buf);
if (!strcmp(buf, "root"))
val = PRS_ROOT;
else if (!strcmp(buf, "member"))
val = PRS_MEMBER;
else if (!strcmp(buf, "isolated"))
val = PRS_ISOLATED;
else
return -EINVAL;
cpuset_full_lock();
if (is_cpuset_online(cs))
retval = update_prstate(cs, val);
cpuset_update_sd_hk_unlock();
return retval ?: nbytes;
}
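/* Control files exposed on the cgroup v2 (default hierarchy) interface. */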
static struct cftype dfl_files[] = {
{
.name = "cpus",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_CPULIST,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "mems",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * MAX_NUMNODES),
.private = FILE_MEMLIST,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "cpus.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_CPULIST,
},
{
.name = "mems.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_MEMLIST,
},
{
.name = "cpus.partition",
.seq_show = cpuset_partition_show,
.write = cpuset_partition_write,
.private = FILE_PARTITION_ROOT,
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct cpuset, partition_file),
},
{
.name = "cpus.exclusive",
.seq_show = cpuset_common_seq_show,
.write = cpuset_write_resmask,
.max_write_len = (100U + 6 * NR_CPUS),
.private = FILE_EXCLUSIVE_CPULIST,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "cpus.exclusive.effective",
.seq_show = cpuset_common_seq_show,
.private = FILE_EFFECTIVE_XCPULIST,
.flags = CFTYPE_NOT_ON_ROOT,
},
{
.name = "cpus.subpartitions",
.seq_show = cpuset_common_seq_show,
.private = FILE_SUBPARTS_CPULIST,
.flags = CFTYPE_ONLY_ON_ROOT | CFTYPE_DEBUG,
},
{
.name = "cpus.isolated",
.seq_show = cpuset_common_seq_show,
.private = FILE_ISOLATED_CPULIST,
.flags = CFTYPE_ONLY_ON_ROOT,
},
{ }
};
static struct cgroup_subsys_state *
cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct cpuset *cs;
if (!parent_css)
return &top_cpuset.css;
cs = dup_or_alloc_cpuset(NULL);
if (!cs)
return ERR_PTR(-ENOMEM);
__set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset1_init(cs);
if (cpuset_v2())
__set_bit(CS_MEMORY_MIGRATE, &cs->flags);
return &cs->css;
}
static int cpuset_css_online(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
struct cpuset *parent = parent_cs(cs);
if (!parent)
return 0;
cpuset_full_lock();
if (cpuset_v2() && !is_sched_load_balance(parent))
clear_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpuset_inc();
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
spin_unlock_irq(&callback_lock);
cpuset1_online_css(css);
cpuset_full_unlock();
return 0;
}
static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
cpuset_full_lock();
if (!cpuset_v2() && is_sched_load_balance(cs))
cpuset_update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
cpuset_dec();
cpuset_full_unlock();
}
static void cpuset_css_killed(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
cpuset_full_lock();
if (is_partition_valid(cs))
update_prstate(cs, PRS_MEMBER);
cpuset_update_sd_hk_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
free_cpuset(cs);
}
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
cpumask_copy(top_cpuset.effective_xcpus, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
cpumask_copy(top_cpuset.cpus_allowed,
top_cpuset.effective_cpus);
top_cpuset.mems_allowed = top_cpuset.effective_mems;
}
spin_unlock_irq(&callback_lock);
mutex_unlock(&cpuset_mutex);
}
static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
{
struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
bool same_cs;
int ret;
rcu_read_lock();
same_cs = (cs == task_cs(current));
rcu_read_unlock();
if (same_cs)
return 0;
lockdep_assert_held(&cgroup_mutex);
mutex_lock(&cpuset_mutex);
ret = cpuset_can_attach_check(cs);
if (ret)
goto out_unlock;
ret = task_can_attach(task);
if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
cs->attach_in_progress++;
out_unlock:
mutex_unlock(&cpuset_mutex);
return ret;
}
static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
{
struct cpuset *cs = css_cs(cset->subsys[cpuset_cgrp_id]);
bool same_cs;
rcu_read_lock();
same_cs = (cs == task_cs(current));
rcu_read_unlock();
if (same_cs)
return;
dec_attach_in_progress(cs);
}
static void cpuset_fork(struct task_struct *task)
{
struct cpuset *cs;
bool same_cs;
rcu_read_lock();
cs = task_cs(task);
same_cs = (cs == task_cs(current));
rcu_read_unlock();
if (same_cs) {
if (cs == &top_cpuset)
return;
set_cpus_allowed_ptr(task, current->cpus_ptr);
task->mems_allowed = current->mems_allowed;
return;
}
mutex_lock(&cpuset_mutex);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cpuset_attach_task(cs, task);
dec_attach_in_progress_locked(cs);
mutex_unlock(&cpuset_mutex);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
.css_alloc = cpuset_css_alloc,
.css_online = cpuset_css_online,
.css_offline = cpuset_css_offline,
.css_killed = cpuset_css_killed,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
.bind = cpuset_bind,
.can_fork = cpuset_can_fork,
.cancel_fork = cpuset_cancel_fork,
.fork = cpuset_fork,
#ifdef CONFIG_CPUSETS_V1
.legacy_cftypes = cpuset1_files,
#endif
.dfl_cftypes = dfl_files,
.early_init = true,
.threaded = true,
};
int __init cpuset_init(void)
{
BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_xcpus, GFP_KERNEL));
BUG_ON(!alloc_cpumask_var(&top_cpuset.exclusive_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&subpartitions_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&isolated_cpus, GFP_KERNEL));
BUG_ON(!zalloc_cpumask_var(&isolated_hk_cpus, GFP_KERNEL));
cpumask_setall(top_cpuset.cpus_allowed);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
cpumask_setall(top_cpuset.effective_xcpus);
cpumask_setall(top_cpuset.exclusive_cpus);
nodes_setall(top_cpuset.effective_mems);
cpuset1_init(&top_cpuset);
BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL));
if (housekeeping_enabled(HK_TYPE_DOMAIN_BOOT))
cpumask_andnot(isolated_cpus, cpu_possible_mask,
housekeeping_cpumask(HK_TYPE_DOMAIN_BOOT));
return 0;
}
static void
hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
spin_lock_irq(&callback_lock);
cpumask_copy(cs->effective_cpus, new_cpus);
cs->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
if (cpus_updated)
cpuset_update_tasks_cpumask(cs, new_cpus);
if (mems_updated)
cpuset_update_tasks_nodemask(cs);
}
void cpuset_force_rebuild(void)
{
force_sd_rebuild = true;
}
static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
{
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated;
bool mems_updated;
bool remote;
int partcmd = -1;
struct cpuset *parent;
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
mutex_lock(&cpuset_mutex);
if (cs->attach_in_progress) {
mutex_unlock(&cpuset_mutex);
goto retry;
}
parent = parent_cs(cs);
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
if (!tmp || !cs->partition_root_state)
goto update_tasks;
remote = is_remote_partition(cs);
if (remote || (is_partition_valid(cs) && is_partition_valid(parent)))
compute_partition_effective_cpumask(cs, &new_cpus);
if (remote && (cpumask_empty(subpartitions_cpus) ||
(cpumask_empty(&new_cpus) &&
partition_is_populated(cs, NULL)))) {
cs->prs_err = PERR_HOTPLUG;
remote_partition_disable(cs, tmp);
compute_effective_cpumask(&new_cpus, cs, parent);
remote = false;
}
if (is_local_partition(cs) &&
(!is_partition_valid(parent) ||
tasks_nocpu_error(parent, cs, &new_cpus) ||
cpumask_empty(subpartitions_cpus)))
partcmd = partcmd_invalidate;
else if (is_partition_valid(parent) && is_partition_invalid(cs) &&
!cpumask_empty(cs->effective_xcpus))
partcmd = partcmd_update;
if (partcmd >= 0) {
update_parent_effective_cpumask(cs, partcmd, NULL, tmp);
if ((partcmd == partcmd_invalidate) || is_partition_valid(cs)) {
compute_partition_effective_cpumask(cs, &new_cpus);
cpuset_force_rebuild();
}
}
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
if (!cpus_updated && !mems_updated)
goto unlock;
if (mems_updated)
check_insane_mems_config(&new_mems);
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
cpuset1_hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
unlock:
mutex_unlock(&cpuset_mutex);
}
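/*
 * cpuset_handle_hotplug - resynchronize cpusets after CPU/memory hotplug
 *
 * Bring the top cpuset's effective CPUs and memory nodes back in line with
 * what is actually active, walk all descendants to update their effective
 * masks (invalidating partitions that lost their CPUs), then rebuild sched
 * domains if needed and queue a deferred housekeeping update when the
 * isolated CPU set changed.  Runs with the cpus read lock held.
 */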
static void cpuset_handle_hotplug(void)
{
static DECLARE_WORK(hk_sd_work, hk_sd_workfn);
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
bool on_dfl = is_in_v2_mode();
struct tmpmasks tmp, *ptmp = NULL;
if (on_dfl && !alloc_tmpmasks(&tmp))
ptmp = &tmp;
lockdep_assert_cpus_held();
mutex_lock(&cpuset_mutex);
cpumask_copy(&new_cpus, cpu_active_mask);
new_mems = node_states[N_MEMORY];
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus) ||
!cpumask_empty(subpartitions_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
if (cpus_updated) {
cpuset_force_rebuild();
spin_lock_irq(&callback_lock);
if (!on_dfl)
cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
if (!cpumask_empty(subpartitions_cpus)) {
if (cpumask_subset(&new_cpus, subpartitions_cpus)) {
cpumask_clear(subpartitions_cpus);
} else {
cpumask_andnot(&new_cpus, &new_cpus,
subpartitions_cpus);
}
}
cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
spin_unlock_irq(&callback_lock);
}
if (mems_updated) {
spin_lock_irq(&callback_lock);
if (!on_dfl)
top_cpuset.mems_allowed = new_mems;
top_cpuset.effective_mems = new_mems;
spin_unlock_irq(&callback_lock);
cpuset_update_tasks_nodemask(&top_cpuset);
}
mutex_unlock(&cpuset_mutex);
if (cpus_updated || mems_updated) {
struct cpuset *cs;
struct cgroup_subsys_state *pos_css;
rcu_read_lock();
cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
if (cs == &top_cpuset || !css_tryget_online(&cs->css))
continue;
rcu_read_unlock();
cpuset_hotplug_update_tasks(cs, ptmp);
rcu_read_lock();
css_put(&cs->css);
}
rcu_read_unlock();
}
if (force_sd_rebuild)
rebuild_sched_domains_cpuslocked();
if (update_housekeeping)
queue_work(system_dfl_wq, &hk_sd_work);
free_tmpmasks(ptmp);
}
void cpuset_update_active_cpus(void)
{
cpuset_handle_hotplug();
}
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
cpuset_handle_hotplug();
return NOTIFY_OK;
}
void __init cpuset_init_smp(void)
{
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
top_cpuset.effective_mems = node_states[N_MEMORY];
hotplug_node_notifier(cpuset_track_online_nodes, CPUSET_CALLBACK_PRI);
cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
BUG_ON(!cpuset_migrate_mm_wq);
}
static void __cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
struct cpuset *cs;
cs = task_cs(tsk);
if (cs != &top_cpuset)
guarantee_active_cpus(tsk, pmask);
if ((cs == &top_cpuset) || cpumask_empty(pmask)) {
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
cpumask_andnot(pmask, possible_mask, subpartitions_cpus);
if (!cpumask_intersects(pmask, cpu_active_mask))
cpumask_copy(pmask, possible_mask);
}
}
void cpuset_cpus_allowed_locked(struct task_struct *tsk, struct cpumask *pmask)
{
lockdep_assert_cpuset_lock_held();
__cpuset_cpus_allowed_locked(tsk, pmask);
}
void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
{
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
__cpuset_cpus_allowed_locked(tsk, pmask);
spin_unlock_irqrestore(&callback_lock, flags);
}
bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
const struct cpumask *cs_mask;
bool changed = false;
rcu_read_lock();
cs_mask = task_cs(tsk)->cpus_allowed;
if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
set_cpus_allowed_force(tsk, cs_mask);
changed = true;
}
rcu_read_unlock();
return changed;
}
void __init cpuset_init_current_mems_allowed(void)
{
nodes_setall(current->mems_allowed);
}
nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
{
nodemask_t mask;
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
guarantee_online_mems(task_cs(tsk), &mask);
spin_unlock_irqrestore(&callback_lock, flags);
return mask;
}
int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
{
return nodes_intersects(*nodemask, current->mems_allowed);
}
static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
{
while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs))
cs = parent_cs(cs);
return cs;
}
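/*
 * cpuset_current_node_allowed - can current allocate memory on @node?
 *
 * Allocations in interrupt context, by OOM victims, or on nodes already in
 * current->mems_allowed are always allowed.  __GFP_HARDWALL allocations are
 * otherwise refused; other allocations are also allowed while exiting, or
 * when @node belongs to the nearest mem_exclusive or mem_hardwall ancestor
 * cpuset.
 */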
bool cpuset_current_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs;
bool allowed;
unsigned long flags;
if (in_interrupt())
return true;
if (node_isset(node, current->mems_allowed))
return true;
if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL)
return false;
if (current->flags & PF_EXITING)
return true;
spin_lock_irqsave(&callback_lock, flags);
cs = nearest_hardwall_ancestor(task_cs(current));
allowed = node_isset(node, cs->mems_allowed);
spin_unlock_irqrestore(&callback_lock, flags);
return allowed;
}
void cpuset_nodes_allowed(struct cgroup *cgroup, nodemask_t *mask)
{
struct cgroup_subsys_state *css;
struct cpuset *cs;
if (!cgroup || !cpuset_v2()) {
nodes_copy(*mask, node_states[N_MEMORY]);
return;
}
css = cgroup_get_e_css(cgroup, &cpuset_cgrp_subsys);
if (!css) {
nodes_copy(*mask, node_states[N_MEMORY]);
return;
}
cs = container_of(css, struct cpuset, css);
nodes_copy(*mask, cs->effective_mems);
css_put(css);
}
static int cpuset_spread_node(int *rotor)
{
return *rotor = next_node_in(*rotor, current->mems_allowed);
}
int cpuset_mem_spread_node(void)
{
if (current->cpuset_mem_spread_rotor == NUMA_NO_NODE)
current->cpuset_mem_spread_rotor =
			node_random(&current->mems_allowed);
	return cpuset_spread_node(&current->cpuset_mem_spread_rotor);
}
int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
const struct task_struct *tsk2)
{
return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed);
}
void cpuset_print_current_mems_allowed(void)
{
struct cgroup *cgrp;
rcu_read_lock();
cgrp = task_cs(current)->css.cgroup;
pr_cont(",cpuset=");
pr_cont_cgroup_name(cgrp);
pr_cont(",mems_allowed=%*pbl",
		nodemask_pr_args(&current->mems_allowed));
rcu_read_unlock();
}
void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
{
seq_printf(m, "Mems_allowed:\t%*pb\n",
nodemask_pr_args(&task->mems_allowed));
seq_printf(m, "Mems_allowed_list:\t%*pbl\n",
nodemask_pr_args(&task->mems_allowed));
}