aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup/cpuset.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup/cpuset.c')
-rw-r--r--kernel/cgroup/cpuset.c127
1 files changed, 66 insertions, 61 deletions
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ae643412948a..f7efa7b4d825 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -56,13 +56,15 @@
#include <linux/time64.h>
#include <linux/backing-dev.h>
#include <linux/sort.h>
-
+#include <linux/oom.h>
+#include <linux/sched/isolation.h>
#include <linux/uaccess.h>
#include <linux/atomic.h>
#include <linux/mutex.h>
#include <linux/cgroup.h>
#include <linux/wait.h>
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -299,6 +301,16 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
/*
+ * Cgroup v2 behavior is used when on default hierarchy or the
+ * cgroup_v2_mode flag is set.
+ */
+static inline bool is_in_v2_mode(void)
+{
+ return cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
+ (cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
+}
+
+/*
* This is ugly, but preserves the userspace API for existing cpuset
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
@@ -488,8 +500,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
/* On legacy hiearchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- !is_cpuset_subset(trial, par))
+ if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
@@ -576,6 +587,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
+/* Must be called with cpuset_mutex held. */
+static inline int nr_cpusets(void)
+{
+ /* jump label reference count + the top-level cpuset */
+ return static_key_count(&cpusets_enabled_key.key) + 1;
+}
+
/*
* generate_sched_domains()
*
@@ -638,7 +656,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
int csn; /* how many cpuset ptrs in csa so far */
int i, j, k; /* indices for partition finding loops */
cpumask_var_t *doms; /* resulting partition; i.e. sched domains */
- cpumask_var_t non_isolated_cpus; /* load balanced CPUs */
struct sched_domain_attr *dattr; /* attributes for custom domains */
int ndoms = 0; /* number of sched domains in result */
int nslot; /* next empty doms[] struct cpumask slot */
@@ -648,10 +665,6 @@ static int generate_sched_domains(cpumask_var_t **domains,
dattr = NULL;
csa = NULL;
- if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL))
- goto done;
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
-
/* Special case for the 99% of systems with one, full, sched domain */
if (is_sched_load_balance(&top_cpuset)) {
ndoms = 1;
@@ -665,7 +678,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- non_isolated_cpus);
+ housekeeping_cpumask(HK_FLAG_DOMAIN));
goto done;
}
@@ -689,7 +702,8 @@ static int generate_sched_domains(cpumask_var_t **domains,
*/
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
- cpumask_intersects(cp->cpus_allowed, non_isolated_cpus)))
+ cpumask_intersects(cp->cpus_allowed,
+ housekeeping_cpumask(HK_FLAG_DOMAIN))))
continue;
if (is_sched_load_balance(cp))
@@ -771,7 +785,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, non_isolated_cpus);
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -784,7 +798,6 @@ restart:
BUG_ON(nslot != ndoms);
done:
- free_cpumask_var(non_isolated_cpus);
kfree(csa);
/*
@@ -861,7 +874,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
set_cpus_allowed_ptr(task, cs->effective_cpus);
css_task_iter_end(&it);
@@ -895,8 +908,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some CPUs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- cpumask_empty(new_cpus))
+ if (is_in_v2_mode() && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
@@ -913,7 +925,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
cpumask_copy(cp->effective_cpus, new_cpus);
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
update_tasks_cpumask(cp);
@@ -1038,40 +1050,25 @@ static void cpuset_post_attach(void)
* @tsk: the task to change
* @newmems: new nodes that the task will be set
*
- * In order to avoid seeing no nodes if the old and new nodes are disjoint,
- * we structure updates as setting all new allowed nodes, then clearing newly
- * disallowed ones.
+ * We use the mems_allowed_seq seqlock to safely update both tsk->mems_allowed
+ * and rebind an eventual tasks' mempolicy. If the task is allocating in
+ * parallel, it might temporarily see an empty intersection, which results in
+ * a seqlock check and retry before OOM or allocation failure.
*/
static void cpuset_change_task_nodemask(struct task_struct *tsk,
nodemask_t *newmems)
{
- bool need_loop;
-
task_lock(tsk);
- /*
- * Determine if a loop is necessary if another thread is doing
- * read_mems_allowed_begin(). If at least one node remains unchanged and
- * tsk does not have a mempolicy, then an empty nodemask will not be
- * possible when mems_allowed is larger than a word.
- */
- need_loop = task_has_mempolicy(tsk) ||
- !nodes_intersects(*newmems, tsk->mems_allowed);
- if (need_loop) {
- local_irq_disable();
- write_seqcount_begin(&tsk->mems_allowed_seq);
- }
+ local_irq_disable();
+ write_seqcount_begin(&tsk->mems_allowed_seq);
nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
-
- mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP2);
+ mpol_rebind_task(tsk, newmems);
tsk->mems_allowed = *newmems;
- if (need_loop) {
- write_seqcount_end(&tsk->mems_allowed_seq);
- local_irq_enable();
- }
+ write_seqcount_end(&tsk->mems_allowed_seq);
+ local_irq_enable();
task_unlock(tsk);
}
@@ -1106,7 +1103,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
*/
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it))) {
struct mm_struct *mm;
bool migrate;
@@ -1164,8 +1161,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* If it becomes empty, inherit the effective mask of the
* parent, which is guaranteed to have some MEMs.
*/
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
- nodes_empty(*new_mems))
+ if (is_in_v2_mode() && nodes_empty(*new_mems))
*new_mems = parent->effective_mems;
/* Skip the whole subtree if the nodemask remains the same. */
@@ -1182,7 +1178,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
cp->effective_mems = *new_mems;
spin_unlock_irq(&callback_lock);
- WARN_ON(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ WARN_ON(!is_in_v2_mode() &&
!nodes_equal(cp->mems_allowed, cp->effective_mems));
update_tasks_nodemask(cp);
@@ -1299,7 +1295,7 @@ static void update_tasks_flags(struct cpuset *cs)
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&cs->css, &it);
+ css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
cpuset_update_task_spread_flag(cs, task);
css_task_iter_end(&it);
@@ -1474,7 +1470,7 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
/* allow moving tasks into an empty cpuset if on default hierarchy */
ret = -ENOSPC;
- if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
+ if (!is_in_v2_mode() &&
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
@@ -1906,6 +1902,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
@@ -1992,7 +1989,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpuset_inc();
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(cs->effective_cpus, parent->effective_cpus);
cs->effective_mems = parent->effective_mems;
}
@@ -2069,7 +2066,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ if (is_in_v2_mode()) {
cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
top_cpuset.mems_allowed = node_possible_map;
} else {
@@ -2263,7 +2260,7 @@ retry:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
- if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
else
@@ -2273,6 +2270,13 @@ retry:
mutex_unlock(&cpuset_mutex);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2294,7 +2298,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
static cpumask_t new_cpus;
static nodemask_t new_mems;
bool cpus_updated, mems_updated;
- bool on_dfl = cgroup_subsys_on_dfl(cpuset_cgrp_subsys);
+ bool on_dfl = is_in_v2_mode();
mutex_lock(&cpuset_mutex);
@@ -2347,8 +2351,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(void)
@@ -2357,16 +2363,15 @@ void cpuset_update_active_cpus(void)
* We're inside cpu hotplug critical region which usually nests
* inside cgroup synchronization. Bounce actual hotplug processing
* to a work item to avoid reverse locking order.
- *
- * We still need to do partition_sched_domains() synchronously;
- * otherwise, the scheduler will get confused and put tasks to the
- * dead CPU. Fall back to the default single domain.
- * cpuset_hotplug_workfn() will rebuild it as necessary.
*/
- partition_sched_domains(1, NULL, NULL);
schedule_work(&cpuset_hotplug_work);
}
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2512,12 +2517,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* If we're in interrupt, yes, we can always allocate. If @node is set in
* current's mems_allowed, yes. If it's not a __GFP_HARDWALL request and this
* node is set in the nearest hardwalled cpuset ancestor to current's cpuset,
- * yes. If current has access to memory reserves due to TIF_MEMDIE, yes.
+ * yes. If current has access to memory reserves as an oom victim, yes.
* Otherwise, no.
*
* GFP_USER allocations are marked with the __GFP_HARDWALL bit,
* and do not allow allocations outside the current tasks cpuset
- * unless the task has been OOM killed as is marked TIF_MEMDIE.
+ * unless the task has been OOM killed.
* GFP_KERNEL allocations are not so marked, so can escape to the
* nearest enclosing hardwalled ancestor cpuset.
*
@@ -2540,7 +2545,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
* affect that:
* in_interrupt - any node ok (current task context irrelevant)
* GFP_ATOMIC - any node ok
- * TIF_MEMDIE - any node ok
+ * tsk_is_oom_victim - any node ok
* GFP_KERNEL - any node in enclosing hardwalled cpuset ok
* GFP_USER - only nodes in current tasks mems allowed ok.
*/
@@ -2558,7 +2563,7 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
* Allow tasks that have access to memory reserves because they have
* been OOM killed to get memory anywhere.
*/
- if (unlikely(test_thread_flag(TIF_MEMDIE)))
+ if (unlikely(tsk_is_oom_victim(current)))
return true;
if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */
return false;