author     Linus Torvalds <torvalds@linux-foundation.org>  2021-07-01 17:22:14 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>  2021-07-01 17:22:14 -0700
commit     3dbdb38e286903ec220aaf1fb29a8d94297da246 (patch)
tree       f80cd38836298d6bb465462e30f4dd644b018735 /kernel
parent     Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu (diff)
parent     cgroup: make per-cgroup pressure stall tracking configurable (diff)
Merge branch 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - cgroup.kill is added which implements atomic killing of the whole
   subtree.

   Down the line, this should be able to replace the multiple userland
   implementations of "keep killing till empty".

 - PSI can now be turned off at boot time to avoid overhead for
   configurations which don't care about PSI.

* 'for-5.14' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  cgroup: make per-cgroup pressure stall tracking configurable
  cgroup: Fix kernel-doc
  cgroup: inline cgroup_task_freeze()
  tests/cgroup: test cgroup.kill
  tests/cgroup: move cg_wait_for(), cg_prepare_for_wait()
  tests/cgroup: use cgroup.kill in cg_killall()
  docs/cgroup: add entry for cgroup.kill
  cgroup: introduce cgroup.kill
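As an illustration of the new interface (not part of the patch): a minimal
userspace sketch that takes down a whole subtree by writing "1" to its
cgroup.kill file. The cgroup2 mount point and the group name "mygroup" are
assumptions for the example, not anything the patch mandates.

	/* Sketch: kill every process in a cgroup subtree via cgroup.kill.
	 * The path below is hypothetical; any non-root cgroup2 directory
	 * with a cgroup.kill file works.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/sys/fs/cgroup/mygroup/cgroup.kill", O_WRONLY);

		if (fd < 0 || write(fd, "1", 1) != 1) {
			perror("cgroup.kill");
			return 1;
		}
		close(fd);
		return 0;
	}

Per cgroup_kill_write() below, any value other than "1" is rejected with
ERANGE, and writes to threaded cgroups fail with EOPNOTSUPP. The PSI side of
the pull hangs off the existing cgroup_disable= boot parameter, which now also
accepts the feature name "pressure" (cgroup_disable=pressure) to skip
per-cgroup pressure accounting while keeping system-wide PSI.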
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup/cgroup.c  | 180
-rw-r--r--  kernel/cgroup/rstat.c   |   2
-rw-r--r--  kernel/sched/psi.c      |  30
3 files changed, 182 insertions(+), 30 deletions(-)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 9cc8c3a686b1..ea5b0f01d2b3 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -209,6 +209,22 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+ OPT_FEATURE_PRESSURE,
+#endif
+ OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+ "pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
+
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
static void css_task_iter_skip(struct css_task_iter *it,
@@ -2390,7 +2406,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
* @mgctx: migration context
*
* Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -3632,6 +3648,18 @@ static void cgroup_pressure_release(struct kernfs_open_file *of)
{
psi_trigger_replace(&of->priv, NULL);
}
+
+bool cgroup_psi_enabled(void)
+{
+ return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+ return false;
+}
+
#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3668,6 +3696,80 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
return nbytes;
}
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+ set_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+
+ css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+ while ((task = css_task_iter_next(&it))) {
+ /* Ignore kernel threads here. */
+ if (task->flags & PF_KTHREAD)
+ continue;
+
+ /* Skip tasks that are already dying. */
+ if (__fatal_signal_pending(task))
+ continue;
+
+ send_sig(SIGKILL, task, 0);
+ }
+ css_task_iter_end(&it);
+
+ spin_lock_irq(&css_set_lock);
+ clear_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+ __cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ ssize_t ret = 0;
+ int kill;
+ struct cgroup *cgrp;
+
+ ret = kstrtoint(strstrip(buf), 0, &kill);
+ if (ret)
+ return ret;
+
+ if (kill != 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /*
+ * Killing is a process directed operation, i.e. the whole thread-group
+ * is taken down so act like we do for cgroup.procs and only make this
+ * writable in non-threaded cgroups.
+ */
+ if (cgroup_is_threaded(cgrp))
+ ret = -EOPNOTSUPP;
+ else
+ cgroup_kill(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of_cft(of);
@@ -3882,6 +3984,8 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
restart:
for (cft = cfts; cft != cft_end && cft->name[0] != '\0'; cft++) {
/* does cft->flags tell us to skip this file on @cgrp? */
+ if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+ continue;
if ((cft->flags & __CFTYPE_ONLY_ON_DFL) && !cgroup_on_dfl(cgrp))
continue;
if ((cft->flags & __CFTYPE_NOT_ON_DFL) && cgroup_on_dfl(cgrp))
@@ -3959,6 +4063,9 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
WARN_ON(cft->ss || cft->kf_ops);
+ if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+ continue;
+
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
@@ -4861,12 +4968,18 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_freeze_write,
},
{
+ .name = "cgroup.kill",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .write = cgroup_kill_write,
+ },
+ {
.name = "cpu.stat",
.seq_show = cpu_stat_show,
},
#ifdef CONFIG_PSI
{
.name = "io.pressure",
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4874,6 +4987,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4881,6 +4995,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
+ .flags = CFTYPE_PRESSURE,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@@ -6080,6 +6195,8 @@ void cgroup_post_fork(struct task_struct *child,
struct kernel_clone_args *kargs)
__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned long cgrp_flags = 0;
+ bool kill = false;
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
@@ -6091,6 +6208,11 @@ void cgroup_post_fork(struct task_struct *child,
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
+ if (kargs->cgrp)
+ cgrp_flags = kargs->cgrp->flags;
+ else
+ cgrp_flags = cset->dfl_cgrp->flags;
+
WARN_ON_ONCE(!list_empty(&child->cg_list));
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
@@ -6099,23 +6221,32 @@ void cgroup_post_fork(struct task_struct *child,
cset = NULL;
}
- /*
- * If the cgroup has to be frozen, the new task has too. Let's set
- * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
- * frozen state.
- */
- if (unlikely(cgroup_task_freeze(child))) {
- spin_lock(&child->sighand->siglock);
- WARN_ON_ONCE(child->frozen);
- child->jobctl |= JOBCTL_TRAP_FREEZE;
- spin_unlock(&child->sighand->siglock);
+ if (!(child->flags & PF_KTHREAD)) {
+ if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+ /*
+ * If the cgroup has to be frozen, the new task has
+ * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+ * get the task into the frozen state.
+ */
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later from
+ * do_freezer_trap(). So we avoid cgroup's transient
+ * switch from the frozen state and back.
+ */
+ }
/*
- * Calling cgroup_update_frozen() isn't required here,
- * because it will be called anyway a bit later from
- * do_freezer_trap(). So we avoid cgroup's transient switch
- * from the frozen state and back.
+ * If the cgroup is to be killed notice it now and take the
+ * child down right after we finished preparing it for
+ * userspace.
*/
+ kill = test_bit(CGRP_KILL, &cgrp_flags);
}
spin_unlock_irq(&css_set_lock);
@@ -6138,6 +6269,10 @@ void cgroup_post_fork(struct task_struct *child,
put_css_set(rcset);
}
+ /* Cgroup has to be killed so take down child immediately. */
+ if (unlikely(kill))
+ do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
cgroup_css_set_put_fork(kargs);
}
@@ -6163,7 +6298,8 @@ void cgroup_exit(struct task_struct *tsk)
cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk));
- if (unlikely(cgroup_task_freeze(tsk)))
+ if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+ test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
spin_unlock_irq(&css_set_lock);
@@ -6214,6 +6350,15 @@ static int __init cgroup_disable(char *str)
pr_info("Disabling %s control group subsystem\n",
ss->name);
}
+
+ for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+ if (strcmp(token, cgroup_opt_feature_names[i]))
+ continue;
+ cgroup_feature_disable_mask |= 1 << i;
+ pr_info("Disabling %s control group feature\n",
+ cgroup_opt_feature_names[i]);
+ break;
+ }
}
return 1;
}
@@ -6512,6 +6657,9 @@ static ssize_t show_delegatable_files(struct cftype *files, char *buf,
if (!(cft->flags & CFTYPE_NS_DELEGATABLE))
continue;
+ if ((cft->flags & CFTYPE_PRESSURE) && !cgroup_psi_enabled())
+ continue;
+
if (prefix)
ret += snprintf(buf + ret, size - ret, "%s.", prefix);
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index cee265cb535c..7f0e58917432 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -220,7 +220,7 @@ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
}
/**
- * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
* Flush stats in @cgrp's subtree and prevent further flushes. Must be
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 58b36d17a09a..1652f2bb54b7 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -148,6 +148,7 @@
static int psi_bug __read_mostly;
DEFINE_STATIC_KEY_FALSE(psi_disabled);
+DEFINE_STATIC_KEY_TRUE(psi_cgroups_enabled);
#ifdef CONFIG_PSI_DEFAULT_DISABLED
static bool psi_enable;
@@ -215,6 +216,9 @@ void __init psi_init(void)
return;
}
+ if (!cgroup_psi_enabled())
+ static_branch_disable(&psi_cgroups_enabled);
+
psi_period = jiffies_to_nsecs(PSI_FREQ);
group_init(&psi_system);
}
@@ -748,23 +752,23 @@ static void psi_group_change(struct psi_group *group, int cpu,
static struct psi_group *iterate_groups(struct task_struct *task, void **iter)
{
+ if (*iter == &psi_system)
+ return NULL;
+
#ifdef CONFIG_CGROUPS
- struct cgroup *cgroup = NULL;
+ if (static_branch_likely(&psi_cgroups_enabled)) {
+ struct cgroup *cgroup = NULL;
- if (!*iter)
- cgroup = task->cgroups->dfl_cgrp;
- else if (*iter == &psi_system)
- return NULL;
- else
- cgroup = cgroup_parent(*iter);
+ if (!*iter)
+ cgroup = task->cgroups->dfl_cgrp;
+ else
+ cgroup = cgroup_parent(*iter);
- if (cgroup && cgroup_parent(cgroup)) {
- *iter = cgroup;
- return cgroup_psi(cgroup);
+ if (cgroup && cgroup_parent(cgroup)) {
+ *iter = cgroup;
+ return cgroup_psi(cgroup);
+ }
}
-#else
- if (*iter)
- return NULL;
#endif
*iter = &psi_system;
return &psi_system;
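For readers following the rewritten iterator, a small standalone model of its
control flow may help. The struct, the parent chain, and the plain bool
standing in for the psi_cgroups_enabled static branch are simplifications for
illustration, not the kernel types:

	/* Simplified userspace model of iterate_groups() after this change.
	 * Groups are visited leaf -> ancestors (excluding the root cgroup),
	 * always finishing with the system-wide group.
	 */
	#include <stdbool.h>
	#include <stdio.h>

	struct group { const char *name; struct group *parent; };

	static struct group psi_system = { "psi_system", NULL };
	static bool psi_cgroups_enabled = true; /* cleared by cgroup_disable=pressure */

	static struct group *iterate_groups(struct group *leaf, void **iter)
	{
		if (*iter == &psi_system)
			return NULL;		/* iteration finished */

		if (psi_cgroups_enabled) {
			struct group *g = *iter ?
				((struct group *)*iter)->parent : leaf;

			if (g && g->parent) {	/* skip the root group */
				*iter = g;
				return g;
			}
		}
		*iter = &psi_system;		/* always end with the system group */
		return &psi_system;
	}

	int main(void)
	{
		struct group root = { "root", NULL };
		struct group parent = { "parent", &root };
		struct group leaf = { "leaf", &parent };
		void *iter = NULL;
		struct group *g;

		while ((g = iterate_groups(&leaf, &iter)))
			printf("%s\n", g->name);	/* leaf, parent, psi_system */
		return 0;
	}

With psi_cgroups_enabled false, the same loop visits only psi_system, which is
the point of the change: the per-cgroup walk disappears from the scheduler hot
path while system-wide pressure reporting keeps working.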