aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup')
-rw-r--r--kernel/cgroup/Makefile1
-rw-r--r--kernel/cgroup/cgroup-internal.h24
-rw-r--r--kernel/cgroup/cgroup-v1.c156
-rw-r--r--kernel/cgroup/cgroup.c1586
-rw-r--r--kernel/cgroup/cpuset.c1179
-rw-r--r--kernel/cgroup/freezer.c2
-rw-r--r--kernel/cgroup/legacy_freezer.c23
-rw-r--r--kernel/cgroup/misc.c424
-rw-r--r--kernel/cgroup/namespace.c9
-rw-r--r--kernel/cgroup/pids.c52
-rw-r--r--kernel/cgroup/rdma.c2
-rw-r--r--kernel/cgroup/rstat.c260
12 files changed, 2710 insertions, 1008 deletions
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile
index 5d7a76bfbbb7..12f8457ad1f9 100644
--- a/kernel/cgroup/Makefile
+++ b/kernel/cgroup/Makefile
@@ -5,4 +5,5 @@ obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o
obj-$(CONFIG_CGROUP_PIDS) += pids.o
obj-$(CONFIG_CGROUP_RDMA) += rdma.o
obj-$(CONFIG_CPUSETS) += cpuset.o
+obj-$(CONFIG_CGROUP_MISC) += misc.o
obj-$(CONFIG_CGROUP_DEBUG) += debug.o
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index bfbeabc17a9d..fd4020835ec6 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -12,7 +12,6 @@
#define TRACE_CGROUP_PATH_LEN 1024
extern spinlock_t trace_cgroup_path_lock;
extern char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-extern bool cgroup_debug;
extern void __init enable_debug_cgroup(void);
/*
@@ -65,6 +64,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
return container_of(kfc, struct cgroup_fs_context, kfc);
}
+struct cgroup_pidlist;
+
+struct cgroup_file_ctx {
+ struct cgroup_namespace *ns;
+
+ struct {
+ void *trigger;
+ } psi;
+
+ struct {
+ bool started;
+ struct css_task_iter iter;
+ } procs;
+
+ struct {
+ struct cgroup_pidlist *pidlist;
+ } procs1;
+};
+
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
@@ -146,7 +164,6 @@ struct cgroup_mgctx {
#define DEFINE_CGROUP_MGCTX(name) \
struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-extern struct mutex cgroup_mutex;
extern spinlock_t css_set_lock;
extern struct cgroup_subsys *cgroup_subsys[];
extern struct list_head cgroup_roots;
@@ -215,6 +232,7 @@ void cgroup_kn_unlock(struct kernfs_node *kn);
int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen,
struct cgroup_namespace *ns);
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor);
void cgroup_free_root(struct cgroup_root *root);
void init_cgroup_root(struct cgroup_fs_context *ctx);
int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask);
@@ -231,6 +249,8 @@ int cgroup_migrate(struct task_struct *leader, bool threadgroup,
int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
bool threadgroup);
+void cgroup_attach_lock(bool lock_threadgroup);
+void cgroup_attach_unlock(bool lock_threadgroup);
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
bool *locked)
__acquires(&cgroup_threadgroup_rwsem);
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index f2d7cea86ffe..52bb5a74a23b 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -38,10 +38,7 @@ static bool cgroup_no_v1_named;
*/
static struct workqueue_struct *cgroup_pidlist_destroy_wq;
-/*
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
- */
+/* protects cgroup_subsys->release_agent_path */
static DEFINE_SPINLOCK(release_agent_path_lock);
bool cgroup1_ssid_disabled(int ssid)
@@ -53,6 +50,8 @@ bool cgroup1_ssid_disabled(int ssid)
* cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from'
* @from: attach to all cgroups of a given task
* @tsk: the task to be attached
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
{
@@ -60,13 +59,10 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
int retval = 0;
mutex_lock(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_lock(true);
for_each_root(root) {
struct cgroup *from_cgrp;
- if (root == &cgrp_dfl_root)
- continue;
-
spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
spin_unlock_irq(&css_set_lock);
@@ -75,7 +71,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
if (retval)
break;
}
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(true);
mutex_unlock(&cgroup_mutex);
return retval;
@@ -83,7 +79,7 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/**
- * cgroup_trasnsfer_tasks - move tasks from one cgroup to another
+ * cgroup_transfer_tasks - move tasks from one cgroup to another
* @to: cgroup to which the tasks will be moved
* @from: cgroup in which the tasks currently reside
*
@@ -92,6 +88,8 @@ EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
* is guaranteed to be either visible in the source cgroup after the
* parent's migration is complete or put into the target cgroup. No task
* can slip out of migration through forking.
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
{
@@ -396,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -405,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
mutex_lock(&cgrp->pidlist_mutex);
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
+ * start() after open. If the matching pidlist is around, we can use
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
+ * directly. It could already have been destroyed.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
+ if (ctx->procs1.pidlist)
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed inbetween. Create a new one.
*/
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
+ if (!ctx->procs1.pidlist) {
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
if (ret)
return ERR_PTR(ret);
}
- l = of->priv;
+ l = ctx->procs1.pidlist;
if (pid) {
int end = l->length;
@@ -451,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
@@ -462,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -506,10 +506,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
goto out_unlock;
/*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
+ * Even if we're attaching all tasks in the thread group, we only need
+ * to check permissions on one of them. Check permissions using the
+ * credentials from file open to protect against inherited fd attacks.
*/
- cred = current_cred();
+ cred = of->file->f_cred;
tcred = get_task_cred(task);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
@@ -545,9 +546,19 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct cgroup *cgrp;
+ struct cgroup_file_ctx *ctx;
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ ctx = of->priv;
+ if ((ctx->ns->user_ns != &init_user_ns) ||
+ !file_ns_capable(of->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
@@ -661,11 +672,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
- * ideally we don't want subsystems moving around while we do this.
- * cgroup_mutex is also necessary to guarantee an atomic snapshot of
- * subsys/hierarchy state.
+ * Grab the subsystems state racily. No need to add avenue to
+ * cgroup_mutex contention.
*/
- mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
@@ -673,7 +682,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
- mutex_unlock(&cgroup_mutex);
return 0;
}
@@ -685,6 +693,8 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
*
* Build and fill cgroupstats so that taskstats can export it to user
* space.
+ *
+ * Return: %0 on success or a negative errno code on failure
*/
int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
{
@@ -698,8 +708,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
- mutex_lock(&cgroup_mutex);
-
/*
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
@@ -707,16 +715,15 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
*/
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
- if (!cgrp || cgroup_is_dead(cgrp)) {
+ if (!cgrp || !cgroup_tryget(cgrp)) {
rcu_read_unlock();
- mutex_unlock(&cgroup_mutex);
return -ENOENT;
}
rcu_read_unlock();
css_task_iter_start(&cgrp->self, 0, &it);
while ((tsk = css_task_iter_next(&it))) {
- switch (tsk->state) {
+ switch (READ_ONCE(tsk->__state)) {
case TASK_RUNNING:
stats->nr_running++;
break;
@@ -730,14 +737,14 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
stats->nr_stopped++;
break;
default:
- if (delayacct_is_task_waiting_on_io(tsk))
+ if (tsk->in_iowait)
stats->nr_io_wait++;
break;
}
}
css_task_iter_end(&it);
- mutex_unlock(&cgroup_mutex);
+ cgroup_put(cgrp);
return 0;
}
@@ -775,22 +782,29 @@ void cgroup1_release_agent(struct work_struct *work)
{
struct cgroup *cgrp =
container_of(work, struct cgroup, release_agent_work);
- char *pathbuf = NULL, *agentbuf = NULL;
+ char *pathbuf, *agentbuf;
char *argv[3], *envp[3];
int ret;
- mutex_lock(&cgroup_mutex);
+ /* snoop agent path and exit early if empty */
+ if (!cgrp->root->release_agent_path[0])
+ return;
+ /* prepare argument buffers */
pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!pathbuf || !agentbuf || !strlen(agentbuf))
- goto out;
+ agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out_free;
- spin_lock_irq(&css_set_lock);
- ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
- spin_unlock_irq(&css_set_lock);
+ spin_lock(&release_agent_path_lock);
+ strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+ spin_unlock(&release_agent_path_lock);
+ if (!agentbuf[0])
+ goto out_free;
+
+ ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
if (ret < 0 || ret >= PATH_MAX)
- goto out;
+ goto out_free;
argv[0] = agentbuf;
argv[1] = pathbuf;
@@ -801,11 +815,7 @@ void cgroup1_release_agent(struct work_struct *work)
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[2] = NULL;
- mutex_unlock(&cgroup_mutex);
call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- goto out_free;
-out:
- mutex_unlock(&cgroup_mutex);
out_free:
kfree(agentbuf);
kfree(pathbuf);
@@ -820,6 +830,10 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent
struct cgroup *cgrp = kn->priv;
int ret;
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(new_name_str, '\n'))
+ return -EINVAL;
+
if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
if (kn->parent != new_parent)
@@ -861,6 +875,8 @@ static int cgroup1_show_options(struct seq_file *seq, struct kernfs_root *kf_roo
seq_puts(seq, ",xattr");
if (root->flags & CGRP_ROOT_CPUSET_V2_MODE)
seq_puts(seq, ",cpuset_v2_mode");
+ if (root->flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
@@ -884,6 +900,8 @@ enum cgroup1_param {
Opt_noprefix,
Opt_release_agent,
Opt_xattr,
+ Opt_favordynmods,
+ Opt_nofavordynmods,
};
const struct fs_parameter_spec cgroup1_fs_parameters[] = {
@@ -895,6 +913,8 @@ const struct fs_parameter_spec cgroup1_fs_parameters[] = {
fsparam_flag ("noprefix", Opt_noprefix),
fsparam_string("release_agent", Opt_release_agent),
fsparam_flag ("xattr", Opt_xattr),
+ fsparam_flag ("favordynmods", Opt_favordynmods),
+ fsparam_flag ("nofavordynmods", Opt_nofavordynmods),
{}
};
@@ -907,14 +927,17 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
opt = fs_parse(fc, cgroup1_fs_parameters, param, &result);
if (opt == -ENOPARAM) {
- if (strcmp(param->key, "source") == 0) {
- fc->source = param->string;
- param->string = NULL;
- return 0;
- }
+ int ret;
+
+ ret = vfs_parse_fs_param_source(fc, param);
+ if (ret != -ENOPARAM)
+ return ret;
for_each_subsys(ss, i) {
if (strcmp(param->key, ss->legacy_name))
continue;
+ if (!cgroup_ssid_enabled(i) || cgroup1_ssid_disabled(i))
+ return invalfc(fc, "Disabled controller '%s'",
+ param->key);
ctx->subsys_mask |= (1 << i);
return 0;
}
@@ -943,10 +966,22 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_xattr:
ctx->flags |= CGRP_ROOT_XATTR;
break;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ break;
+ case Opt_nofavordynmods:
+ ctx->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ break;
case Opt_release_agent:
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return invalfc(fc, "release_agent respecified");
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
+ return invalfc(fc, "Setting release_agent not allowed");
ctx->release_agent = param->string;
param->string = NULL;
break;
@@ -996,7 +1031,7 @@ static int check_cgroupfs_options(struct fs_context *fc)
ctx->subsys_mask &= enabled;
/*
- * In absense of 'none', 'name=' or subsystem name options,
+ * In absence of 'none', 'name=' and subsystem name options,
* let's default to 'all'.
*/
if (!ctx->subsys_mask && !ctx->none && !ctx->name)
@@ -1188,8 +1223,11 @@ static int cgroup1_root_to_use(struct fs_context *fc)
init_cgroup_root(ctx);
ret = cgroup_setup_root(root, ctx->subsys_mask);
- if (ret)
+ if (!ret)
+ cgroup_favor_dynmods(root, ctx->flags & CGRP_ROOT_FAVOR_DYNMODS);
+ else
cgroup_free_root(root);
+
return ret;
}
@@ -1214,9 +1252,7 @@ int cgroup1_get_tree(struct fs_context *fc)
ret = cgroup_do_get_tree(fc);
if (!ret && percpu_ref_is_dying(&ctx->root->cgrp.self.refcnt)) {
- struct super_block *sb = fc->root->d_sb;
- dput(fc->root);
- deactivate_locked_super(sb);
+ fc_drop_locked(fc);
ret = 1;
}
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3dead0416b91..2319946715e0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -30,6 +30,7 @@
#include "cgroup-internal.h"
+#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -68,6 +69,14 @@
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
+ * To avoid confusing the compiler (and generating warnings) with code
+ * that attempts to access what would be a 0-element array (i.e. sized
+ * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
+ * constant expression can be added.
+ */
+#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
+
+/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
@@ -87,7 +96,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-bool cgroup_debug __read_mostly;
+static bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -153,11 +162,7 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
-/*
- * The default hierarchy, reserved for the subsystems that are otherwise
- * unattached - it never has more than a single cgroup, and all tasks are
- * part of that cgroup.
- */
+/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
@@ -203,7 +208,7 @@ static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .count = REFCOUNT_INIT(2),
+ .ns.count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
@@ -212,6 +217,23 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
+
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+ OPT_FEATURE_PRESSURE,
+#endif
+ OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+ "pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
@@ -236,7 +258,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
bool cgroup_ssid_enabled(int ssid)
{
- if (CGROUP_SUBSYS_COUNT == 0)
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
return false;
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
@@ -248,12 +270,9 @@ bool cgroup_ssid_enabled(int ssid)
*
* The default hierarchy is the v2 interface of cgroup and this function
* can be used to test whether a cgroup is on the default hierarchy for
- * cases where a subsystem should behave differnetly depending on the
+ * cases where a subsystem should behave differently depending on the
* interface version.
*
- * The set of behaviors which change on the default hierarchy are still
- * being determined and the mount option is prefixed with __DEVEL__.
- *
* List of changed behaviors:
*
* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
@@ -261,15 +280,13 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
- * - Remount is disallowed.
- *
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
* "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they got
- * recycled inbetween reads.
+ * recycled in-between reads.
*
* - "release_agent" and "notify_on_release" are removed. Replacement
* notification mechanism will be implemented.
@@ -288,9 +305,6 @@ bool cgroup_ssid_enabled(int ssid)
* - cpuset: a task can be moved into an empty cpuset, and again it takes
* masks of ancestors.
*
- * - memcg: use_hierarchy is on by default and the cgroup file for the flag
- * is not created.
- *
* - blkcg: blk-throttle becomes properly hierarchical.
*
* - debug: disallowed on the default hierarchy.
@@ -352,7 +366,7 @@ static bool cgroup_is_mixable(struct cgroup *cgrp)
return !cgroup_parent(cgrp);
}
-/* can @cgrp become a thread root? should always be true for a thread root */
+/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
/* mixables don't care */
@@ -466,7 +480,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
- if (ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex));
else
@@ -478,7 +492,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
- * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
* or is offline, %NULL is returned.
*/
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
@@ -537,13 +551,16 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
* the root css is returned, so this function always returns a valid css.
*
* The returned css is not guaranteed to be online, and therefore it is the
- * callers responsiblity to tryget a reference for it.
+ * callers responsibility to try get a reference for it.
*/
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
do {
css = cgroup_css(cgrp, ss);
@@ -571,6 +588,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
rcu_read_lock();
do {
@@ -587,6 +607,7 @@ out_unlock:
rcu_read_unlock();
return css;
}
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
@@ -640,7 +661,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
- if (cft->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
@@ -688,7 +709,7 @@ EXPORT_SYMBOL_GPL(of_css);
*/
#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
unsigned long __ss_mask = (ss_mask); \
- if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
+ if (!CGROUP_HAS_SUBSYS_CONFIG) { \
(ssid) = 0; \
break; \
} \
@@ -709,7 +730,7 @@ EXPORT_SYMBOL_GPL(of_css);
; \
else
-/* walk live descendants in preorder */
+/* walk live descendants in pre order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
@@ -743,7 +764,8 @@ struct css_set init_css_set = {
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
@@ -943,7 +965,7 @@ void put_css_set_locked(struct css_set *cset)
WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
- /* This css_set is dead. unlink it and release cgroup and css refs */
+ /* This css_set is dead. Unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
css_put(cset->subsys[ssid]);
@@ -1068,7 +1090,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
/*
* Build the set of subsystem state objects that we want to see in the
- * new css_set. while subsystems can change globally, the entries here
+ * new css_set. While subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
@@ -1158,7 +1180,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
/*
* Always add links to the tail of the lists so that the lists are
- * in choronological order.
+ * in chronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
@@ -1218,7 +1240,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
- INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&cset->mg_dst_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
@@ -1280,11 +1303,25 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct cgroup *root_cgrp = kf_root->kn->priv;
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
return root_cgrp->root;
}
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+ bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+ /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ if (favor && !favoring) {
+ rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+ root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ } else if (!favor && favoring) {
+ rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+ root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ }
+}
+
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
@@ -1345,82 +1382,98 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_root_count--;
}
+ cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
+ cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
/*
- * look up cgroup associated with current task's cgroup namespace on the
- * specified hierarchy
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
*/
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
{
- struct cgroup *res = NULL;
- struct css_set *cset;
-
- lockdep_assert_held(&css_set_lock);
-
- rcu_read_lock();
+ struct cgroup *res_cgroup = NULL;
- cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
- res = &root->cgrp;
+ res_cgroup = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
+ res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
- res = c;
+ res_cgroup = c;
break;
}
}
}
- rcu_read_unlock();
- BUG_ON(!res);
- return res;
+ BUG_ON(!res_cgroup);
+ return res_cgroup;
}
-/* look up cgroup associated with given css_set on the specified hierarchy */
-static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
- struct cgroup_root *root)
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
{
struct cgroup *res = NULL;
+ struct css_set *cset;
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
- } else {
- struct cgrp_cset_link *link;
+ rcu_read_lock();
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ res = __cset_cgroup_from_root(cset, root);
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
+ rcu_read_unlock();
- BUG_ON(!res);
return res;
}
/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_lock);
+
+ return __cset_cgroup_from_root(cset, root);
+}
+
+/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
@@ -1642,7 +1695,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
/**
* css_clear_dir - remove subsys files in a cgroup directory
- * @css: taget css
+ * @css: target css
*/
static void css_clear_dir(struct cgroup_subsys_state *css)
{
@@ -1655,12 +1708,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- cgroup_addrm_files(css, cgrp, cfts, false);
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1683,14 +1740,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
return 0;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- if (ret < 0)
- return ret;
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0)
+ return ret;
+ }
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -1718,6 +1783,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, i, ret;
+ u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1734,8 +1800,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
+
+ /*
+ * Collect ssid's that need to be disabled from default
+ * hierarchy.
+ */
+ if (ss->root == &cgrp_dfl_root)
+ dfl_disable_ss_mask |= 1 << ssid;
+
} while_each_subsys_mask();
+ if (dfl_disable_ss_mask) {
+ struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
+
+ /*
+ * Controllers from default hierarchy that need to be rebound
+ * are all disabled together in one go.
+ */
+ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
+
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
@@ -1744,10 +1830,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
WARN_ON(!css || cgroup_css(dcgrp, ss));
- /* disable from the source */
- src_root->subsys_mask &= ~(1 << ssid);
- WARN_ON(cgroup_apply_control(scgrp));
- cgroup_finalize_control(scgrp, 0);
+ if (src_root != &cgrp_dfl_root) {
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
@@ -1761,6 +1849,13 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
&dcgrp->e_csets[ss->id]);
spin_unlock_irq(&css_set_lock);
+ if (ss->css_rstat_flush) {
+ list_del_rcu(&css->rstat_css_node);
+ synchronize_rcu();
+ list_add_rcu(&css->rstat_css_node,
+ &dcgrp->rstat_css_list);
+ }
+
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
@@ -1812,13 +1907,17 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_favordynmods,
Opt_memory_localevents,
+ Opt_memory_recursiveprot,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
+ fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
{}
};
@@ -1836,9 +1935,15 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
+ case Opt_memory_recursiveprot:
+ ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ return 0;
}
return -EINVAL;
}
@@ -1851,10 +1956,18 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ cgroup_favor_dynmods(&cgrp_dfl_root,
+ root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+
+ if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
}
}
@@ -1862,8 +1975,12 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
+ seq_puts(seq, ",memory_recursiveprot");
return 0;
}
@@ -1910,7 +2027,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- root->flags = ctx->flags;
+ /* DYNMODS must be modified through cgroup_favor_dynmods() */
+ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
@@ -1954,24 +2072,29 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
- KERNFS_ROOT_SUPPORT_EXPORTOP,
+ KERNFS_ROOT_SUPPORT_EXPORTOP |
+ KERNFS_ROOT_SUPPORT_USER_XATTR,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
- root_cgrp->kn = root->kf_root->kn;
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
- ret = rebind_subsystems(root, ss_mask);
+ ret = cgroup_rstat_init(root_cgrp);
if (ret)
goto destroy_root;
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto exit_stats;
+
ret = cgroup_bpf_inherit(root_cgrp);
WARN_ON_ONCE(ret);
@@ -2000,10 +2123,11 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- kernfs_activate(root_cgrp->kn);
ret = 0;
goto out;
+exit_stats:
+ cgroup_rstat_exit(root_cgrp);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
@@ -2080,7 +2204,7 @@ static int cgroup_get_tree(struct fs_context *fc)
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- cgrp_dfl_visible = true;
+ WRITE_ONCE(cgrp_dfl_visible, true);
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
@@ -2126,6 +2250,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
+
+#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+#endif
return 0;
}
@@ -2137,13 +2265,14 @@ static void cgroup_kill_sb(struct super_block *sb)
/*
* If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
- * cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
+ }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -2263,7 +2392,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- ret = strlcpy(buf, "/", buflen);
+ ret = strscpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
@@ -2273,6 +2402,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+void cgroup_attach_lock(bool lock_threadgroup)
+{
+ cpus_read_lock();
+ if (lock_threadgroup)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+void cgroup_attach_unlock(bool lock_threadgroup)
+{
+ if (lock_threadgroup)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
+}
+
+/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
@@ -2341,7 +2511,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
- while (&cset->mg_node != tset->csets) {
+ while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
if (!task)
task = list_first_entry(&cset->mg_tasks,
struct task_struct, cg_list);
@@ -2374,7 +2544,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
* @mgctx: migration context
*
* Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -2499,10 +2669,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
- /* mixables don't care */
- if (cgroup_is_mixable(dst_cgrp))
- return 0;
-
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
@@ -2526,21 +2692,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(cset);
+ }
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
+ list_del_init(&cset->mg_dst_preload_node);
put_css_set_locked(cset);
}
@@ -2580,11 +2752,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
-
- if (!list_empty(&src_cset->mg_preload_node))
+ if (!list_empty(&src_cset->mg_src_preload_node))
return;
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
@@ -2593,7 +2765,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+ list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
}
/**
@@ -2618,7 +2790,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
- mg_preload_node) {
+ mg_src_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
@@ -2637,7 +2809,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
+ list_del_init(&src_cset->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
@@ -2645,8 +2817,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
src_cset->mg_dst_cset = dst_cset;
- if (list_empty(&dst_cset->mg_preload_node))
- list_add_tail(&dst_cset->mg_preload_node,
+ if (list_empty(&dst_cset->mg_dst_preload_node))
+ list_add_tail(&dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
@@ -2714,11 +2886,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
{
DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
- int ret;
-
- ret = cgroup_migrate_vet_dst(dst_cgrp);
- if (ret)
- return ret;
+ int ret = 0;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -2746,8 +2914,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ bool *threadgroup_locked)
{
struct task_struct *tsk;
pid_t pid;
@@ -2764,12 +2931,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
- *locked = true;
- } else {
- *locked = false;
- }
+ *threadgroup_locked = pid || threadgroup;
+ cgroup_attach_lock(*threadgroup_locked);
rcu_read_lock();
if (pid) {
@@ -2800,17 +2963,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
goto out_unlock_rcu;
out_unlock_threadgroup:
- if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
- *locked = false;
- }
+ cgroup_attach_unlock(*threadgroup_locked);
+ *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2818,8 +2978,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(threadgroup_locked);
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2874,29 +3034,47 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
+ /*
+ * As cgroup_update_dfl_csses() is only called by
+ * cgroup_apply_control(). The csses associated with the
+ * given cgrp will not be affected by changes made to
+ * its subtree_control file. We can skip them.
+ */
+ if (dsct == cgrp)
+ continue;
+
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+ cgroup_attach_lock(has_tasks);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
@@ -2908,7 +3086,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(has_tasks);
return ret;
}
@@ -3145,11 +3323,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
-
- return 0;
+ return cgroup_update_dfl_csses(cgrp);
}
/**
@@ -3542,30 +3716,32 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
+ struct psi_group *psi;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
@@ -3574,14 +3750,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
- new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
+ psi = cgroup_psi(cgrp);
+ new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3591,33 +3773,117 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_destroy(ctx->psi.trigger);
}
+
+bool cgroup_psi_enabled(void)
+{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
+ return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+ return false;
+}
+
#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3654,32 +3920,128 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
return nbytes;
}
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+ set_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+
+ css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+ while ((task = css_task_iter_next(&it))) {
+ /* Ignore kernel threads here. */
+ if (task->flags & PF_KTHREAD)
+ continue;
+
+ /* Skip tasks that are already dying. */
+ if (__fatal_signal_pending(task))
+ continue;
+
+ send_sig(SIGKILL, task, 0);
+ }
+ css_task_iter_end(&it);
+
+ spin_lock_irq(&css_set_lock);
+ clear_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+ __cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ ssize_t ret = 0;
+ int kill;
+ struct cgroup *cgrp;
+
+ ret = kstrtoint(strstrip(buf), 0, &kill);
+ if (ret)
+ return ret;
+
+ if (kill != 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /*
+ * Killing is a process directed operation, i.e. the whole thread-group
+ * is taken down so act like we do for cgroup.procs and only make this
+ * writable in non-threaded cgroups.
+ */
+ if (cgroup_is_threaded(cgrp))
+ ret = -EOPNOTSUPP;
+ else
+ cgroup_kill(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
+ if (!nbytes)
+ return 0;
+
/*
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
@@ -3688,7 +4050,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -3723,7 +4085,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
if (cft->poll)
return cft->poll(of, pt);
@@ -3929,19 +4291,26 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
+ int ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
+
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
@@ -3954,26 +4323,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
+ ret = -ENOMEM;
+ break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
}
- return 0;
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
@@ -3995,6 +4364,12 @@ int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
mutex_lock(&cgroup_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
@@ -4100,6 +4475,26 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
+
+/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4133,7 +4528,7 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
* implies that if we observe !CSS_RELEASED on @pos in this RCU
* critical section, the one pointed to by its next pointer is
* guaranteed to not have finished its RCU grace period even if we
- * have dropped rcu_read_lock() inbetween iterations.
+ * have dropped rcu_read_lock() in-between iterations.
*
* If @pos has CSS_RELEASED set, its next pointer can't be
* dereferenced; however, as each css is given a monotonically
@@ -4148,7 +4543,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
} else if (likely(!(pos->flags & CSS_RELEASED))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
- list_for_each_entry_rcu(next, &parent->children, sibling)
+ list_for_each_entry_rcu(next, &parent->children, sibling,
+ lockdep_is_held(&cgroup_mutex))
if (next->serial_nr > pos->serial_nr)
break;
}
@@ -4380,7 +4776,7 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
}
/**
- * css_task_iter_advance_css_set - advance a task itererator to the next css_set
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
@@ -4391,29 +4787,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
- /* Advance to the next non-empty css_set */
- do {
- cset = css_task_iter_next_css_set(it);
- if (!cset) {
- it->task_pos = NULL;
- return;
+ /* Advance to the next non-empty css_set and find first non-empty tasks list*/
+ while ((cset = css_task_iter_next_css_set(it))) {
+ if (!list_empty(&cset->tasks)) {
+ it->cur_tasks_head = &cset->tasks;
+ break;
+ } else if (!list_empty(&cset->mg_tasks)) {
+ it->cur_tasks_head = &cset->mg_tasks;
+ break;
+ } else if (!list_empty(&cset->dying_tasks)) {
+ it->cur_tasks_head = &cset->dying_tasks;
+ break;
}
- } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-
- if (!list_empty(&cset->tasks)) {
- it->task_pos = cset->tasks.next;
- it->cur_tasks_head = &cset->tasks;
- } else if (!list_empty(&cset->mg_tasks)) {
- it->task_pos = cset->mg_tasks.next;
- it->cur_tasks_head = &cset->mg_tasks;
- } else {
- it->task_pos = cset->dying_tasks.next;
- it->cur_tasks_head = &cset->dying_tasks;
}
-
- it->tasks_head = &cset->tasks;
- it->mg_tasks_head = &cset->mg_tasks;
- it->dying_tasks_head = &cset->dying_tasks;
+ if (!cset) {
+ it->task_pos = NULL;
+ return;
+ }
+ it->task_pos = it->cur_tasks_head->next;
/*
* We don't keep css_sets locked across iteration steps and thus
@@ -4458,24 +4849,24 @@ static void css_task_iter_advance(struct css_task_iter *it)
repeat:
if (it->task_pos) {
/*
- * Advance iterator to find next entry. cset->tasks is
- * consumed first and then ->mg_tasks. After ->mg_tasks,
- * we move onto the next cset.
+ * Advance iterator to find next entry. We go through cset
+ * tasks, mg_tasks and dying_tasks, when consumed we move onto
+ * the next cset.
*/
if (it->flags & CSS_TASK_ITER_SKIPPED)
it->flags &= ~CSS_TASK_ITER_SKIPPED;
else
it->task_pos = it->task_pos->next;
- if (it->task_pos == it->tasks_head) {
- it->task_pos = it->mg_tasks_head->next;
- it->cur_tasks_head = it->mg_tasks_head;
+ if (it->task_pos == &it->cur_cset->tasks) {
+ it->cur_tasks_head = &it->cur_cset->mg_tasks;
+ it->task_pos = it->cur_tasks_head->next;
}
- if (it->task_pos == it->mg_tasks_head) {
- it->task_pos = it->dying_tasks_head->next;
- it->cur_tasks_head = it->dying_tasks_head;
+ if (it->task_pos == &it->cur_cset->mg_tasks) {
+ it->cur_tasks_head = &it->cur_cset->dying_tasks;
+ it->task_pos = it->cur_tasks_head->next;
}
- if (it->task_pos == it->dying_tasks_head)
+ if (it->task_pos == &it->cur_cset->dying_tasks)
css_task_iter_advance_css_set(it);
} else {
/* called from start, proceed to the first cset */
@@ -4493,12 +4884,12 @@ repeat:
goto repeat;
/* and dying leaders w/o live member threads */
- if (it->cur_tasks_head == it->dying_tasks_head &&
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
!atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
- if (it->cur_tasks_head == it->dying_tasks_head)
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
goto repeat;
}
}
@@ -4524,7 +4915,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
it->ss = css->ss;
it->flags = flags;
- if (it->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
else
it->cset_pos = &css->cgroup->cset_links;
@@ -4593,21 +4984,21 @@ void css_task_iter_end(struct css_task_iter *it)
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4615,21 +5006,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4662,13 +5050,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
return 0;
}
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+{
+ int ret;
+ struct inode *inode;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
+ iput(inode);
+ return ret;
+}
+
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
- struct inode *inode;
int ret;
lockdep_assert_held(&cgroup_mutex);
@@ -4678,12 +5081,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
com_cgrp = cgroup_parent(com_cgrp);
/* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
+ ret = cgroup_may_write(com_cgrp, sb);
if (ret)
return ret;
@@ -4699,19 +5097,42 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
return 0;
}
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
{
+ int ret = 0;
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
+ if (ret)
+ return ret;
+
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
+
+ if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+ ret = -EOPNOTSUPP;
+
+ return ret;
+}
+
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ bool threadgroup)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
- bool locked;
+ bool threadgroup_locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, true, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4721,19 +5142,33 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
- ret = cgroup_attach_task(dst_cgrp, task, true);
+ ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
+ return ret;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, true) ?: nbytes;
}
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
@@ -4744,46 +5179,7 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct cgroup *src_cgrp, *dst_cgrp;
- struct task_struct *task;
- ssize_t ret;
- bool locked;
-
- buf = strstrip(buf);
-
- dst_cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!dst_cgrp)
- return -ENODEV;
-
- task = cgroup_procs_write_start(buf, false, &locked);
- ret = PTR_ERR_OR_ZERO(task);
- if (ret)
- goto out_unlock;
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* thread migrations follow the cgroup.procs delegation rule */
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
- if (ret)
- goto out_finish;
-
- /* and must be contained in the same domain */
- ret = -EOPNOTSUPP;
- if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
- goto out_finish;
-
- ret = cgroup_attach_task(dst_cgrp, task, false);
-
-out_finish:
- cgroup_procs_write_finish(task, locked);
-out_unlock:
- cgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
+ return __cgroup_procs_write(of, buf, false) ?: nbytes;
}
/* cgroup core interface files for the default hierarchy */
@@ -4850,13 +5246,22 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_freeze_write,
},
{
- .name = "cpu.stat",
+ .name = "cgroup.kill",
.flags = CFTYPE_NOT_ON_ROOT,
+ .write = cgroup_kill_write,
+ },
+ {
+ .name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4864,6 +5269,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4871,11 +5277,27 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -4938,8 +5360,7 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
- if (cgroup_on_dfl(cgrp))
- cgroup_rstat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4980,8 +5401,7 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
- if (cgroup_on_dfl(cgrp))
- cgroup_rstat_flush(cgrp);
+ cgroup_rstat_flush(cgrp);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5038,7 +5458,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
- if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+ if (ss->css_rstat_flush)
list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
BUG_ON(cgroup_css(cgrp, ss));
@@ -5128,15 +5548,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
if (err)
goto err_list_del;
- if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
- cgroup_parent(parent)) {
- pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
- current->comm, current->pid, ss->name);
- if (!strcmp(ss->name, "memory"))
- pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
- ss->warned_broken_hierarchy = true;
- }
-
return css;
err_list_del:
@@ -5163,8 +5574,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5172,11 +5582,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_free_cgrp;
- if (cgroup_on_dfl(parent)) {
- ret = cgroup_rstat_init(cgrp);
- if (ret)
- goto out_cancel_ref;
- }
+ ret = cgroup_rstat_init(cgrp);
+ if (ret)
+ goto out_cancel_ref;
/* create the directory */
kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
@@ -5218,7 +5626,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5263,8 +5671,7 @@ out_psi_free:
out_kernfs_remove:
kernfs_remove(cgrp->kn);
out_stat_exit:
- if (cgroup_on_dfl(parent))
- cgroup_rstat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5355,7 +5762,7 @@ out_unlock:
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css().
*/
static void css_killed_work_fn(struct work_struct *work)
{
@@ -5499,7 +5906,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
- if (parent && cgroup_is_threaded(cgrp))
+ if (cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
@@ -5562,7 +5969,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
- css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
+ css = ss->css_alloc(NULL);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
@@ -5639,8 +6046,6 @@ int __init cgroup_init_early(void)
return 0;
}
-static u16 cgroup_disable_mask __initdata;
-
/**
* cgroup_init - cgroup initialization
*
@@ -5654,16 +6059,11 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
- /*
- * The latency of the synchronize_rcu() is too high for cgroups,
- * avoid it at the cost of forcing all readers into the slow path.
- */
- rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
-
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
@@ -5699,12 +6099,8 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (cgroup_disable_mask & (1 << ssid)) {
- static_branch_disable(cgroup_subsys_enabled_key[ssid]);
- printk(KERN_INFO "Disabling %s control group subsystem\n",
- ss->name);
+ if (!cgroup_ssid_enabled(ssid))
continue;
- }
if (cgroup1_ssid_disabled(ssid))
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
@@ -5782,6 +6178,48 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp, *root_cgrp;
+
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return ERR_PTR(-ENOENT);
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
+
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (cgrp && !cgroup_tryget(cgrp))
+ cgrp = NULL;
+
+ rcu_read_unlock();
+ kernfs_put(kn);
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_id);
+
+/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
@@ -5806,7 +6244,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -5864,8 +6302,7 @@ out:
* @child: pointer to task_struct of forking parent process.
*
* A task is associated with the init_css_set until cgroup_post_fork()
- * attaches it to the parent's css_set. Empty cg_list indicates that
- * @child isn't holding reference to its css_set.
+ * attaches it to the target css_set.
*/
void cgroup_fork(struct task_struct *child)
{
@@ -5874,20 +6311,208 @@ void cgroup_fork(struct task_struct *child)
}
/**
- * cgroup_can_fork - called on a new task before the process is exposed
- * @child: the task in question.
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
*
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
- * returns an error, the fork aborts with that error code. This allows for
- * a cgroup subsystem to conditionally allow or deny new forks.
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
*/
-int cgroup_can_fork(struct task_struct *child)
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+
+/**
+ * cgroup_css_set_fork - find or create a css_set for a child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This functions finds or creates a new css_set which the child
+ * process will be attached to in cgroup_post_fork(). By default,
+ * the child process will be given the same css_set as its parent.
+ *
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
+ * existing css_set which includes the requested cgroup and if not create
+ * a new css_set that the child will be attached to later. If this function
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * to the target cgroup.
+ */
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+ __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+{
+ int ret;
+ struct cgroup *dst_cgrp = NULL;
+ struct css_set *cset;
+ struct super_block *sb;
+ struct file *f;
+
+ if (kargs->flags & CLONE_INTO_CGROUP)
+ mutex_lock(&cgroup_mutex);
+
+ cgroup_threadgroup_change_begin(current);
+
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+ kargs->cset = cset;
+ return 0;
+ }
+
+ f = fget_raw(kargs->cgroup);
+ if (!f) {
+ ret = -EBADF;
+ goto err;
+ }
+ sb = f->f_path.dentry->d_sb;
+
+ dst_cgrp = cgroup_get_from_file(f);
+ if (IS_ERR(dst_cgrp)) {
+ ret = PTR_ERR(dst_cgrp);
+ dst_cgrp = NULL;
+ goto err;
+ }
+
+ if (cgroup_is_dead(dst_cgrp)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ /*
+ * Verify that we the target cgroup is writable for us. This is
+ * usually done by the vfs layer but since we're not going through
+ * the vfs layer here we need to do it "manually".
+ */
+ ret = cgroup_may_write(dst_cgrp, sb);
+ if (ret)
+ goto err;
+
+ /*
+ * Spawning a task directly into a cgroup works by passing a file
+ * descriptor to the target cgroup directory. This can even be an O_PATH
+ * file descriptor. But it can never be a cgroup.procs file descriptor.
+ * This was done on purpose so spawning into a cgroup could be
+ * conceptualized as an atomic
+ *
+ * fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ * write(fd, <child-pid>, ...);
+ *
+ * sequence, i.e. it's a shorthand for the caller opening and writing
+ * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ * to always use the caller's credentials.
+ */
+ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
+ if (ret)
+ goto err;
+
+ kargs->cset = find_css_set(cset, dst_cgrp);
+ if (!kargs->cset) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ put_css_set(cset);
+ fput(f);
+ kargs->cgrp = dst_cgrp;
+ return ret;
+
+err:
+ cgroup_threadgroup_change_end(current);
+ mutex_unlock(&cgroup_mutex);
+ if (f)
+ fput(f);
+ if (dst_cgrp)
+ cgroup_put(dst_cgrp);
+ put_css_set(cset);
+ if (kargs->cset)
+ put_css_set(kargs->cset);
+ return ret;
+}
+
+/**
+ * cgroup_css_set_put_fork - drop references we took during fork
+ * @kargs: the arguments passed to create the child process
+ *
+ * Drop references to the prepared css_set and target cgroup if
+ * CLONE_INTO_CGROUP was requested.
+ */
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+ cgroup_threadgroup_change_end(current);
+
+ if (kargs->flags & CLONE_INTO_CGROUP) {
+ struct cgroup *cgrp = kargs->cgrp;
+ struct css_set *cset = kargs->cset;
+
+ mutex_unlock(&cgroup_mutex);
+
+ if (cset) {
+ put_css_set(cset);
+ kargs->cset = NULL;
+ }
+
+ if (cgrp) {
+ cgroup_put(cgrp);
+ kargs->cgrp = NULL;
+ }
+ }
+}
+
+/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This prepares a new css_set for the child process which the child will
+ * be attached to in cgroup_post_fork().
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
+ * callback returns an error, the fork aborts with that error code. This
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i, j, ret;
+ ret = cgroup_css_set_fork(kargs);
+ if (ret)
+ return ret;
+
do_each_subsys_mask(ss, i, have_canfork_callback) {
- ret = ss->can_fork(child);
+ ret = ss->can_fork(child, kargs->cset);
if (ret)
goto out_revert;
} while_each_subsys_mask();
@@ -5899,73 +6524,100 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child);
+ ss->cancel_fork(child, kargs->cset);
}
+ cgroup_css_set_put_fork(kargs);
+
return ret;
}
/**
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
- * @child: the task in question
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeded.
+ * cgroup_can_fork() succeeded and cleans up references we took to
+ * prepare a new css_set for the child process in cgroup_can_fork().
*/
-void cgroup_cancel_fork(struct task_struct *child)
+void cgroup_cancel_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child);
+ ss->cancel_fork(child, kargs->cset);
+
+ cgroup_css_set_put_fork(kargs);
}
/**
- * cgroup_post_fork - called on a new task after adding it to the task list
- * @child: the task in question
- *
- * Adds the task to the list running through its css_set if necessary and
- * call the subsystem fork() callbacks. Has to be after the task is
- * visible on the task list in case we race with the first call to
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
- * list.
+ * cgroup_post_fork - finalize cgroup setup for the child process
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * Attach the child process to its css_set calling the subsystem fork()
+ * callbacks.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned long cgrp_flags = 0;
+ bool kill = false;
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
+ cset = kargs->cset;
+ kargs->cset = NULL;
+
spin_lock_irq(&css_set_lock);
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
+ if (kargs->cgrp)
+ cgrp_flags = kargs->cgrp->flags;
+ else
+ cgrp_flags = cset->dfl_cgrp->flags;
+
WARN_ON_ONCE(!list_empty(&child->cg_list));
- cset = task_css_set(current); /* current is @child's parent */
- get_css_set(cset);
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
+ } else {
+ put_css_set(cset);
+ cset = NULL;
}
- /*
- * If the cgroup has to be frozen, the new task has too. Let's set
- * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
- * frozen state.
- */
- if (unlikely(cgroup_task_freeze(child))) {
- spin_lock(&child->sighand->siglock);
- WARN_ON_ONCE(child->frozen);
- child->jobctl |= JOBCTL_TRAP_FREEZE;
- spin_unlock(&child->sighand->siglock);
+ if (!(child->flags & PF_KTHREAD)) {
+ if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+ /*
+ * If the cgroup has to be frozen, the new task has
+ * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+ * get the task into the frozen state.
+ */
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later from
+ * do_freezer_trap(). So we avoid cgroup's transient
+ * switch from the frozen state and back.
+ */
+ }
/*
- * Calling cgroup_update_frozen() isn't required here,
- * because it will be called anyway a bit later from
- * do_freezer_trap(). So we avoid cgroup's transient switch
- * from the frozen state and back.
+ * If the cgroup is to be killed notice it now and take the
+ * child down right after we finished preparing it for
+ * userspace.
*/
+ kill = test_bit(CGRP_KILL, &cgrp_flags);
}
spin_unlock_irq(&css_set_lock);
@@ -5978,6 +6630,21 @@ void cgroup_post_fork(struct task_struct *child)
do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
} while_each_subsys_mask();
+
+ /* Make the new cset the root_cset of the new cgroup namespace. */
+ if (kargs->flags & CLONE_NEWCGROUP) {
+ struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+
+ get_css_set(cset);
+ child->nsproxy->cgroup_ns->root_cset = cset;
+ put_css_set(rcset);
+ }
+
+ /* Cgroup has to be killed so take down child immediately. */
+ if (unlikely(kill))
+ do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
+ cgroup_css_set_put_fork(kargs);
}
/**
@@ -6002,7 +6669,8 @@ void cgroup_exit(struct task_struct *tsk)
cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk));
- if (unlikely(cgroup_task_freeze(tsk)))
+ if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+ test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
spin_unlock_irq(&css_set_lock);
@@ -6048,7 +6716,19 @@ static int __init cgroup_disable(char *str)
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
- cgroup_disable_mask |= 1 << i;
+
+ static_branch_disable(cgroup_subsys_enabled_key[i]);
+ pr_info("Disabling %s control group subsystem\n",
+ ss->name);
+ }
+
+ for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+ if (strcmp(token, cgroup_opt_feature_names[i]))
+ continue;
+ cgroup_feature_disable_mask |= 1 << i;
+ pr_info("Disabling %s control group feature\n",
+ cgroup_opt_feature_names[i]);
+ break;
}
}
return 1;
@@ -6125,46 +6805,51 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
- * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
- * if @path points to a non-directory.
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
+ * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
- struct cgroup *cgrp;
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
- mutex_lock(&cgroup_mutex);
+ root_cgrp = current_cgns_cgroup_dfl();
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
+ if (!kn)
+ goto out;
- kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
- if (kn) {
- if (kernfs_type(kn) == KERNFS_DIR) {
- cgrp = kn->priv;
- cgroup_get_live(cgrp);
- } else {
- cgrp = ERR_PTR(-ENOTDIR);
- }
- kernfs_put(kn);
- } else {
- cgrp = ERR_PTR(-ENOENT);
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ cgrp = ERR_PTR(-ENOTDIR);
+ goto out_kernfs;
}
- mutex_unlock(&cgroup_mutex);
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+
+out_kernfs:
+ kernfs_put(kn);
+out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
- * cgroup_get_from_fd - get a cgroup pointer from a fd
- * @fd: fd obtained by open(cgroup2_dir)
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
-struct cgroup *cgroup_get_from_fd(int fd)
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
- struct cgroup_subsys_state *css;
struct cgroup *cgrp;
struct file *f;
@@ -6172,17 +6857,27 @@ struct cgroup *cgroup_get_from_fd(int fd)
if (!f)
return ERR_PTR(-EBADF);
- css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ cgrp = cgroup_v1v2_get_from_file(f);
fput(f);
- if (IS_ERR(css))
- return ERR_CAST(css);
+ return cgrp;
+}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
- cgrp = css->cgroup;
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
}
-
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
@@ -6235,106 +6930,56 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled)
- return;
-
- /* Socket clone path */
- if (skcd->val) {
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- return;
- }
-
- /* Don't associate the sock with unrelated interrupted task's cgroup. */
- if (in_interrupt())
- return;
+ struct cgroup *cgroup;
rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
- cgroup_bpf_get(cset->dfl_cgrp);
+ cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
-
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
-void cgroup_sk_free(struct sock_cgroup_data *skcd)
+void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- cgroup_bpf_put(cgrp);
- cgroup_put(cgrp);
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
-#endif /* CONFIG_SOCK_CGROUP_DATA */
-
-#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_prog *replace_prog, enum bpf_attach_type type,
- u32 flags)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 flags)
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
- int ret;
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, type);
- mutex_unlock(&cgroup_mutex);
- return ret;
+ cgroup_bpf_put(cgrp);
+ cgroup_put(cgrp);
}
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
-{
- int ret;
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-#endif /* CONFIG_CGROUP_BPF */
+#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
@@ -6366,8 +7011,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
int ssid;
ssize_t ret = 0;
- ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- NULL);
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
@@ -6381,7 +7029,11 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
+ return snprintf(buf, PAGE_SIZE,
+ "nsdelegate\n"
+ "favordynmods\n"
+ "memory_localevents\n"
+ "memory_recursiveprot\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 58f5073acff7..b474289c15b8 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -33,6 +33,7 @@
#include <linux/interrupt.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
+#include <linux/kthread.h>
#include <linux/list.h>
#include <linux/mempolicy.h>
#include <linux/mm.h>
@@ -69,6 +70,13 @@
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
+/*
+ * There could be abnormal cpuset configurations for cpu or memory
+ * node binding, add this key to provide a quick low-cost judgment
+ * of the situation.
+ */
+DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
+
/* See "Frequency meter" comments, below. */
struct fmeter {
@@ -78,6 +86,30 @@ struct fmeter {
spinlock_t lock; /* guards read or write of above */
};
+/*
+ * Invalid partition error code
+ */
+enum prs_errcode {
+ PERR_NONE = 0,
+ PERR_INVCPUS,
+ PERR_INVPARENT,
+ PERR_NOTPART,
+ PERR_NOTEXCL,
+ PERR_NOCPUS,
+ PERR_HOTPLUG,
+ PERR_CPUSEMPTY,
+};
+
+static const char * const perr_strings[] = {
+ [PERR_INVCPUS] = "Invalid cpu list in cpuset.cpus",
+ [PERR_INVPARENT] = "Parent is an invalid partition root",
+ [PERR_NOTPART] = "Parent is not a partition root",
+ [PERR_NOTEXCL] = "Cpu list in cpuset.cpus not exclusive",
+ [PERR_NOCPUS] = "Parent unable to distribute cpu downstream",
+ [PERR_HOTPLUG] = "No cpu available due to hotplug",
+ [PERR_CPUSEMPTY] = "cpuset.cpus is empty",
+};
+
struct cpuset {
struct cgroup_subsys_state css;
@@ -98,7 +130,7 @@ struct cpuset {
* and if it ends up empty, it will inherit the parent's mask.
*
*
- * On legacy hierachy:
+ * On legacy hierarchy:
*
* The user-configured masks are always the same with effective masks.
*/
@@ -160,25 +192,33 @@ struct cpuset {
*/
int use_parent_ecpus;
int child_ecpus_count;
+
+ /* Invalid partition error code, not lock protected */
+ enum prs_errcode prs_err;
+
+ /* Handle for cpuset.cpus.partition */
+ struct cgroup_file partition_file;
};
/*
* Partition root states:
*
- * 0 - not a partition root
- *
+ * 0 - member (not a partition root)
* 1 - partition root
- *
+ * 2 - partition root without load balancing (isolated)
* -1 - invalid partition root
- * None of the cpus in cpus_allowed can be put into the parent's
- * subparts_cpus. In this case, the cpuset is not a real partition
- * root anymore. However, the CPU_EXCLUSIVE bit will still be set
- * and the cpuset can be restored back to a partition root if the
- * parent cpuset can give more CPUs back to this child cpuset.
+ * -2 - invalid isolated partition root
*/
-#define PRS_DISABLED 0
-#define PRS_ENABLED 1
-#define PRS_ERROR -1
+#define PRS_MEMBER 0
+#define PRS_ROOT 1
+#define PRS_ISOLATED 2
+#define PRS_INVALID_ROOT -1
+#define PRS_INVALID_ISOLATED -2
+
+static inline bool is_prs_invalid(int prs_state)
+{
+ return prs_state < 0;
+}
/*
* Temporary cpumasks for working with partitions that are passed among
@@ -258,15 +298,43 @@ static inline int is_spread_slab(const struct cpuset *cs)
return test_bit(CS_SPREAD_SLAB, &cs->flags);
}
-static inline int is_partition_root(const struct cpuset *cs)
+static inline int is_partition_valid(const struct cpuset *cs)
{
return cs->partition_root_state > 0;
}
+static inline int is_partition_invalid(const struct cpuset *cs)
+{
+ return cs->partition_root_state < 0;
+}
+
+/*
+ * Callers should hold callback_lock to modify partition_root_state.
+ */
+static inline void make_partition_invalid(struct cpuset *cs)
+{
+ if (is_partition_valid(cs))
+ cs->partition_root_state = -cs->partition_root_state;
+}
+
+/*
+ * Send notification event of whenever partition_root_state changes.
+ */
+static inline void notify_partition_change(struct cpuset *cs, int old_prs)
+{
+ if (old_prs == cs->partition_root_state)
+ return;
+ cgroup_file_notify(&cs->partition_file);
+
+ /* Reset prs_err if not invalid */
+ if (is_partition_valid(cs))
+ WRITE_ONCE(cs->prs_err, PERR_NONE);
+}
+
static struct cpuset top_cpuset = {
.flags = ((1 << CS_ONLINE) | (1 << CS_CPU_EXCLUSIVE) |
(1 << CS_MEM_EXCLUSIVE)),
- .partition_root_state = PRS_ENABLED,
+ .partition_root_state = PRS_ROOT,
};
/**
@@ -298,17 +366,19 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * There are two global locks guarding cpuset structures - cpuset_rwsem and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment.
+ * comment. The cpuset code uses only cpuset_rwsem write lock. Other
+ * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
+ * prevent change to cpuset structures.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
+ * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
* is the only task able to also acquire callback_lock and be able to
* modify cpusets. It can perform various checks on the cpuset structure
* first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_mutex. While it is performing these checks, various
+ * just holding cpuset_rwsem. While it is performing these checks, various
* callback routines can briefly acquire callback_lock to query cpusets.
* Once it is ready to make the changes, it takes callback_lock, blocking
* everyone else.
@@ -357,9 +427,24 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
+static inline void check_insane_mems_config(nodemask_t *nodes)
+{
+ if (!cpusets_insane_config() &&
+ movable_only_nodes(nodes)) {
+ static_branch_enable(&cpusets_insane_config_key);
+ pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
+ "Cpuset allocations might fail even with a lot of memory available.\n",
+ nodemask_pr_args(nodes));
+ }
+}
+
/*
- * Cgroup v2 behavior is used when on default hierarchy or the
- * cgroup_v2_mode flag is set.
+ * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
+ * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
+ * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
+ * With v2 behavior, "cpus" and "mems" are always what the users have
+ * requested and won't be changed by hotplug events. Only the effective
+ * cpus or mems will be affected.
*/
static inline bool is_in_v2_mode(void)
{
@@ -367,33 +452,81 @@ static inline bool is_in_v2_mode(void)
(cpuset_cgrp_subsys.root->flags & CGRP_ROOT_CPUSET_V2_MODE);
}
+/**
+ * partition_is_populated - check if partition has tasks
+ * @cs: partition root to be checked
+ * @excluded_child: a child cpuset to be excluded in task checking
+ * Return: true if there are tasks, false otherwise
+ *
+ * It is assumed that @cs is a valid partition root. @excluded_child should
+ * be non-NULL when this cpuset is going to become a partition itself.
+ */
+static inline bool partition_is_populated(struct cpuset *cs,
+ struct cpuset *excluded_child)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *child;
+
+ if (cs->css.cgroup->nr_populated_csets)
+ return true;
+ if (!excluded_child && !cs->nr_subparts_cpus)
+ return cgroup_is_populated(cs->css.cgroup);
+
+ rcu_read_lock();
+ cpuset_for_each_child(child, css, cs) {
+ if (child == excluded_child)
+ continue;
+ if (is_partition_valid(child))
+ continue;
+ if (cgroup_is_populated(child->css.cgroup)) {
+ rcu_read_unlock();
+ return true;
+ }
+ }
+ rcu_read_unlock();
+ return false;
+}
+
/*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task. If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+ struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ struct cpuset *cs;
+
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
+ cpumask_copy(pmask, cpu_online_mask);
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+
+ while (!cpumask_intersects(cs->effective_cpus, pmask)) {
cs = parent_cs(cs);
if (unlikely(!cs)) {
/*
* The top cpuset doesn't have any online cpu as a
* consequence of a race between cpuset_hotplug_work
* and cpu hotplug notifier. But we know the top
- * cpuset's effective_cpus is on its way to to be
+ * cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
- cpumask_copy(pmask, cpu_online_mask);
- return;
+ goto out_unlock;
}
}
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+ cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+ rcu_read_unlock();
}
/*
@@ -405,7 +538,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -417,7 +550,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_mutex held.
+ * Call with callback_lock or cpuset_rwsem held.
*/
static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
@@ -438,7 +571,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_mutex.
+ * are only set if the other's are set. Call holding cpuset_rwsem.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -541,13 +674,42 @@ static inline void free_cpuset(struct cpuset *cs)
}
/*
+ * validate_change_legacy() - Validate conditions specific to legacy (v1)
+ * behavior.
+ */
+static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
+{
+ struct cgroup_subsys_state *css;
+ struct cpuset *c, *par;
+ int ret;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* Each of our child cpusets must be a subset of us */
+ ret = -EBUSY;
+ cpuset_for_each_child(c, css, cur)
+ if (!is_cpuset_subset(c, trial))
+ goto out;
+
+ /* On legacy hierarchy, we must be a subset of our parent cpuset. */
+ ret = -EACCES;
+ par = parent_cs(cur);
+ if (par && !is_cpuset_subset(trial, par))
+ goto out;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
*
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_mutex held.
+ * cpuset_rwsem held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -564,44 +726,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
- int ret;
+ int ret = 0;
rcu_read_lock();
- /* Each of our child cpusets must be a subset of us */
- ret = -EBUSY;
- cpuset_for_each_child(c, css, cur)
- if (!is_cpuset_subset(c, trial))
- goto out;
+ if (!is_in_v2_mode())
+ ret = validate_change_legacy(cur, trial);
+ if (ret)
+ goto out;
/* Remaining checks don't apply to root cpuset */
- ret = 0;
if (cur == &top_cpuset)
goto out;
par = parent_cs(cur);
- /* On legacy hiearchy, we must be a subset of our parent cpuset. */
- ret = -EACCES;
- if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
- goto out;
-
- /*
- * If either I or some sibling (!= me) is exclusive, we can't
- * overlap
- */
- ret = -EINVAL;
- cpuset_for_each_child(c, css, par) {
- if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
- c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
- goto out;
- if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
- c != cur &&
- nodes_intersects(trial->mems_allowed, c->mems_allowed))
- goto out;
- }
-
/*
* Cpusets with tasks - existing or newly being attached - can't
* be changed to have empty cpus_allowed or mems_allowed.
@@ -626,6 +765,22 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
trial->cpus_allowed))
goto out;
+ /*
+ * If either I or some sibling (!= me) is exclusive, we can't
+ * overlap
+ */
+ ret = -EINVAL;
+ cpuset_for_each_child(c, css, par) {
+ if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
+ c != cur &&
+ cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ goto out;
+ if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
+ c != cur &&
+ nodes_intersects(trial->mems_allowed, c->mems_allowed))
+ goto out;
+ }
+
ret = 0;
out:
rcu_read_unlock();
@@ -670,7 +825,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_mutex held. */
+/* Must be called with cpuset_rwsem held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -696,7 +851,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_mutex held.
+ * Must be called with cpuset_rwsem held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -761,7 +916,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
update_domain_attr_tree(dattr, &top_cpuset);
}
cpumask_and(doms[0], top_cpuset.effective_cpus,
- housekeeping_cpumask(HK_FLAG_DOMAIN));
+ housekeeping_cpumask(HK_TYPE_DOMAIN));
goto done;
}
@@ -791,7 +946,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
if (!cpumask_empty(cp->cpus_allowed) &&
!(is_sched_load_balance(cp) &&
cpumask_intersects(cp->cpus_allowed,
- housekeeping_cpumask(HK_FLAG_DOMAIN))))
+ housekeeping_cpumask(HK_TYPE_DOMAIN))))
continue;
if (root_load_balance &&
@@ -803,7 +958,7 @@ static int generate_sched_domains(cpumask_var_t **domains,
csa[csn++] = cp;
/* skip @cp's subtree if not a partition root */
- if (!is_partition_root(cp))
+ if (!is_partition_valid(cp))
pos_css = css_rightmost_descendant(pos_css);
}
rcu_read_unlock();
@@ -880,7 +1035,7 @@ restart:
if (apn == b->pn) {
cpumask_or(dp, dp, b->effective_cpus);
- cpumask_and(dp, dp, housekeeping_cpumask(HK_FLAG_DOMAIN));
+ cpumask_and(dp, dp, housekeeping_cpumask(HK_TYPE_DOMAIN));
if (dattr)
update_domain_attr_tree(dattr + nslot, b);
@@ -975,29 +1130,52 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_mutex held. Takes get_online_cpus().
+ * Call with cpuset_rwsem held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
+ struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
+ struct cpuset *cs;
int ndoms;
lockdep_assert_cpus_held();
percpu_rwsem_assert_held(&cpuset_rwsem);
/*
- * We have raced with CPU hotplug. Don't do anything to avoid
+ * If we have raced with CPU hotplug, return early to avoid
* passing doms with offlined cpu to partition_sched_domains().
- * Anyways, hotplug work item will rebuild sched domains.
+ * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+ *
+ * With no CPUs in any subpartitions, top_cpuset's effective CPUs
+ * should be the same as the active CPUs, so checking only top_cpuset
+ * is enough to detect racing CPU offlines.
*/
if (!top_cpuset.nr_subparts_cpus &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
return;
- if (top_cpuset.nr_subparts_cpus &&
- !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
- return;
+ /*
+ * With subpartition CPUs, however, the effective CPUs of a partition
+ * root should be only a subset of the active CPUs. Since a CPU in any
+ * partition root could be offlined, all must be checked.
+ */
+ if (top_cpuset.nr_subparts_cpus) {
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (!is_partition_valid(cs)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ if (!cpumask_subset(cs->effective_cpus,
+ cpu_active_mask)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+ }
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
@@ -1013,11 +1191,11 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
rebuild_sched_domains_locked();
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
}
/**
@@ -1025,17 +1203,25 @@ void rebuild_sched_domains(void)
* @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_cpumask(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
+ bool top_cs = cs == &top_cpuset;
css_task_iter_start(&cs->css, 0, &it);
- while ((task = css_task_iter_next(&it)))
+ while ((task = css_task_iter_next(&it))) {
+ /*
+ * Percpu kthreads in top_cpuset are ignored
+ */
+ if (top_cs && (task->flags & PF_KTHREAD) &&
+ kthread_is_per_cpu(task))
+ continue;
set_cpus_allowed_ptr(task, cs->effective_cpus);
+ }
css_task_iter_end(&it);
}
@@ -1070,15 +1256,18 @@ enum subparts_cmd {
partcmd_enable, /* Enable partition root */
partcmd_disable, /* Disable partition root */
partcmd_update, /* Update parent's subparts_cpus */
+ partcmd_invalidate, /* Make partition invalid */
};
+static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
+ int turning_on);
/**
* update_parent_subparts_cpumask - update subparts_cpus mask of parent cpuset
* @cpuset: The cpuset that requests change in partition root state
* @cmd: Partition root state change command
* @newmask: Optional new cpumask for partcmd_update
* @tmp: Temporary addmask and delmask
- * Return: 0, 1 or an error code
+ * Return: 0 or a partition root state error code
*
* For partcmd_enable, the cpuset is being transformed from a non-partition
* root to a partition root. The cpus_allowed mask of the given cpuset will
@@ -1086,42 +1275,39 @@ enum subparts_cmd {
* effective_cpus. The function will return 0 if all the CPUs listed in
* cpus_allowed can be granted or an error code will be returned.
*
- * For partcmd_disable, the cpuset is being transofrmed from a partition
- * root back to a non-partition root. any CPUs in cpus_allowed that are in
+ * For partcmd_disable, the cpuset is being transformed from a partition
+ * root back to a non-partition root. Any CPUs in cpus_allowed that are in
* parent's subparts_cpus will be taken away from that cpumask and put back
- * into parent's effective_cpus. 0 should always be returned.
+ * into parent's effective_cpus. 0 will always be returned.
+ *
+ * For partcmd_update, if the optional newmask is specified, the cpu list is
+ * to be changed from cpus_allowed to newmask. Otherwise, cpus_allowed is
+ * assumed to remain the same. The cpuset should either be a valid or invalid
+ * partition root. The partition root state may change from valid to invalid
+ * or vice versa. An error code will only be returned if transitioning from
+ * invalid to valid violates the exclusivity rule.
*
- * For partcmd_update, if the optional newmask is specified, the cpu
- * list is to be changed from cpus_allowed to newmask. Otherwise,
- * cpus_allowed is assumed to remain the same. The cpuset should either
- * be a partition root or an invalid partition root. The partition root
- * state may change if newmask is NULL and none of the requested CPUs can
- * be granted by the parent. The function will return 1 if changes to
- * parent's subparts_cpus and effective_cpus happen or 0 otherwise.
- * Error code should only be returned when newmask is non-NULL.
+ * For partcmd_invalidate, the current partition will be made invalid.
*
* The partcmd_enable and partcmd_disable commands are used by
- * update_prstate(). The partcmd_update command is used by
- * update_cpumasks_hier() with newmask NULL and update_cpumask() with
- * newmask set.
- *
- * The checking is more strict when enabling partition root than the
- * other two commands.
- *
- * Because of the implicit cpu exclusive nature of a partition root,
- * cpumask changes that violates the cpu exclusivity rule will not be
- * permitted when checked by validate_change(). The validate_change()
- * function will also prevent any changes to the cpu list if it is not
- * a superset of children's cpu lists.
+ * update_prstate(). An error code may be returned and the caller will check
+ * for error.
+ *
+ * The partcmd_update command is used by update_cpumasks_hier() with newmask
+ * NULL and update_cpumask() with newmask set. The partcmd_invalidate is used
+ * by update_cpumask() with NULL newmask. In both cases, the callers won't
+ * check for error and so partition_root_state and prs_error will be updated
+ * directly.
*/
-static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
+static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
struct cpumask *newmask,
struct tmpmasks *tmp)
{
- struct cpuset *parent = parent_cs(cpuset);
+ struct cpuset *parent = parent_cs(cs);
int adding; /* Moving cpus from effective_cpus to subparts_cpus */
int deleting; /* Moving cpus from subparts_cpus to effective_cpus */
- bool part_error = false; /* Partition error? */
+ int old_prs, new_prs;
+ int part_error = PERR_NONE; /* Partition error? */
percpu_rwsem_assert_held(&cpuset_rwsem);
@@ -1130,124 +1316,164 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
* The new cpumask, if present, or the current cpus_allowed must
* not be empty.
*/
- if (!is_partition_root(parent) ||
- (newmask && cpumask_empty(newmask)) ||
- (!newmask && cpumask_empty(cpuset->cpus_allowed)))
- return -EINVAL;
-
- /*
- * Enabling/disabling partition root is not allowed if there are
- * online children.
- */
- if ((cmd != partcmd_update) && css_has_online_children(&cpuset->css))
- return -EBUSY;
-
- /*
- * Enabling partition root is not allowed if not all the CPUs
- * can be granted from parent's effective_cpus or at least one
- * CPU will be left after that.
- */
- if ((cmd == partcmd_enable) &&
- (!cpumask_subset(cpuset->cpus_allowed, parent->effective_cpus) ||
- cpumask_equal(cpuset->cpus_allowed, parent->effective_cpus)))
- return -EINVAL;
+ if (!is_partition_valid(parent)) {
+ return is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART;
+ }
+ if ((newmask && cpumask_empty(newmask)) ||
+ (!newmask && cpumask_empty(cs->cpus_allowed)))
+ return PERR_CPUSEMPTY;
/*
- * A cpumask update cannot make parent's effective_cpus become empty.
+ * new_prs will only be changed for the partcmd_update and
+ * partcmd_invalidate commands.
*/
adding = deleting = false;
+ old_prs = new_prs = cs->partition_root_state;
if (cmd == partcmd_enable) {
- cpumask_copy(tmp->addmask, cpuset->cpus_allowed);
+ /*
+ * Enabling partition root is not allowed if cpus_allowed
+ * doesn't overlap parent's cpus_allowed.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->cpus_allowed))
+ return PERR_INVCPUS;
+
+ /*
+ * A parent can be left with no CPU as long as there is no
+ * task directly associated with the parent partition.
+ */
+ if (!cpumask_intersects(cs->cpus_allowed, parent->effective_cpus) &&
+ partition_is_populated(parent, cs))
+ return PERR_NOCPUS;
+
+ cpumask_copy(tmp->addmask, cs->cpus_allowed);
adding = true;
} else if (cmd == partcmd_disable) {
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
+ /*
+ * Need to remove cpus from parent's subparts_cpus for valid
+ * partition root.
+ */
+ deleting = !is_prs_invalid(old_prs) &&
+ cpumask_and(tmp->delmask, cs->cpus_allowed,
parent->subparts_cpus);
+ } else if (cmd == partcmd_invalidate) {
+ if (is_prs_invalid(old_prs))
+ return 0;
+
+ /*
+ * Make the current partition invalid. It is assumed that
+ * invalidation is caused by violating cpu exclusivity rule.
+ */
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
+ if (old_prs > 0) {
+ new_prs = -old_prs;
+ part_error = PERR_NOTEXCL;
+ }
} else if (newmask) {
/*
* partcmd_update with newmask:
*
+ * Compute add/delete mask to/from subparts_cpus
+ *
* delmask = cpus_allowed & ~newmask & parent->subparts_cpus
- * addmask = newmask & parent->effective_cpus
+ * addmask = newmask & parent->cpus_allowed
* & ~parent->subparts_cpus
*/
- cpumask_andnot(tmp->delmask, cpuset->cpus_allowed, newmask);
+ cpumask_andnot(tmp->delmask, cs->cpus_allowed, newmask);
deleting = cpumask_and(tmp->delmask, tmp->delmask,
parent->subparts_cpus);
- cpumask_and(tmp->addmask, newmask, parent->effective_cpus);
+ cpumask_and(tmp->addmask, newmask, parent->cpus_allowed);
adding = cpumask_andnot(tmp->addmask, tmp->addmask,
parent->subparts_cpus);
/*
- * Return error if the new effective_cpus could become empty.
+ * Make partition invalid if parent's effective_cpus could
+ * become empty and there are tasks in the parent.
*/
if (adding &&
- cpumask_equal(parent->effective_cpus, tmp->addmask)) {
- if (!deleting)
- return -EINVAL;
- /*
- * As some of the CPUs in subparts_cpus might have
- * been offlined, we need to compute the real delmask
- * to confirm that.
- */
- if (!cpumask_and(tmp->addmask, tmp->delmask,
- cpu_active_mask))
- return -EINVAL;
- cpumask_copy(tmp->addmask, parent->effective_cpus);
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ !cpumask_intersects(tmp->delmask, cpu_active_mask) &&
+ partition_is_populated(parent, cs)) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
} else {
/*
* partcmd_update w/o newmask:
*
- * addmask = cpus_allowed & parent->effectiveb_cpus
+ * delmask = cpus_allowed & parent->subparts_cpus
+ * addmask = cpus_allowed & parent->cpus_allowed
+ * & ~parent->subparts_cpus
*
- * Note that parent's subparts_cpus may have been
- * pre-shrunk in case there is a change in the cpu list.
- * So no deletion is needed.
+ * This gets invoked either due to a hotplug event or from
+ * update_cpumasks_hier(). This can cause the state of a
+ * partition root to transition from valid to invalid or vice
+ * versa. So we still need to compute the addmask and delmask.
+
+ * A partition error happens when:
+ * 1) Cpuset is valid partition, but parent does not distribute
+ * out any CPUs.
+ * 2) Parent has tasks and all its effective CPUs will have
+ * to be distributed out.
*/
- adding = cpumask_and(tmp->addmask, cpuset->cpus_allowed,
- parent->effective_cpus);
- part_error = cpumask_equal(tmp->addmask,
- parent->effective_cpus);
+ cpumask_and(tmp->addmask, cs->cpus_allowed,
+ parent->cpus_allowed);
+ adding = cpumask_andnot(tmp->addmask, tmp->addmask,
+ parent->subparts_cpus);
+
+ if ((is_partition_valid(cs) && !parent->nr_subparts_cpus) ||
+ (adding &&
+ cpumask_subset(parent->effective_cpus, tmp->addmask) &&
+ partition_is_populated(parent, cs))) {
+ part_error = PERR_NOCPUS;
+ adding = false;
+ }
+
+ if (part_error && is_partition_valid(cs) &&
+ parent->nr_subparts_cpus)
+ deleting = cpumask_and(tmp->delmask, cs->cpus_allowed,
+ parent->subparts_cpus);
}
+ if (part_error)
+ WRITE_ONCE(cs->prs_err, part_error);
if (cmd == partcmd_update) {
- int prev_prs = cpuset->partition_root_state;
-
/*
- * Check for possible transition between PRS_ENABLED
- * and PRS_ERROR.
+ * Check for possible transition between valid and invalid
+ * partition root.
*/
- switch (cpuset->partition_root_state) {
- case PRS_ENABLED:
+ switch (cs->partition_root_state) {
+ case PRS_ROOT:
+ case PRS_ISOLATED:
if (part_error)
- cpuset->partition_root_state = PRS_ERROR;
+ new_prs = -old_prs;
break;
- case PRS_ERROR:
+ case PRS_INVALID_ROOT:
+ case PRS_INVALID_ISOLATED:
if (!part_error)
- cpuset->partition_root_state = PRS_ENABLED;
+ new_prs = -old_prs;
break;
}
- /*
- * Set part_error if previously in invalid state.
- */
- part_error = (prev_prs == PRS_ERROR);
}
- if (!part_error && (cpuset->partition_root_state == PRS_ERROR))
- return 0; /* Nothing need to be done */
+ if (!adding && !deleting && (new_prs == old_prs))
+ return 0;
- if (cpuset->partition_root_state == PRS_ERROR) {
- /*
- * Remove all its cpus from parent's subparts_cpus.
- */
- adding = false;
- deleting = cpumask_and(tmp->delmask, cpuset->cpus_allowed,
- parent->subparts_cpus);
+ /*
+ * Transitioning between invalid to valid or vice versa may require
+ * changing CS_CPU_EXCLUSIVE and CS_SCHED_LOAD_BALANCE.
+ */
+ if (old_prs != new_prs) {
+ if (is_prs_invalid(old_prs) && !is_cpu_exclusive(cs) &&
+ (update_flag(CS_CPU_EXCLUSIVE, cs, 1) < 0))
+ return PERR_NOTEXCL;
+ if (is_prs_invalid(new_prs) && is_cpu_exclusive(cs))
+ update_flag(CS_CPU_EXCLUSIVE, cs, 0);
}
- if (!adding && !deleting)
- return 0;
-
/*
* Change the parent's subparts_cpus.
* Newly added CPUs will be removed from effective_cpus and
@@ -1272,40 +1498,68 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
}
parent->nr_subparts_cpus = cpumask_weight(parent->subparts_cpus);
+
+ if (old_prs != new_prs)
+ cs->partition_root_state = new_prs;
+
spin_unlock_irq(&callback_lock);
- return cmd == partcmd_update;
+ if (adding || deleting)
+ update_tasks_cpumask(parent);
+
+ /*
+ * Set or clear CS_SCHED_LOAD_BALANCE when partcmd_update, if necessary.
+ * rebuild_sched_domains_locked() may be called.
+ */
+ if (old_prs != new_prs) {
+ if (old_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ else if (new_prs == PRS_ISOLATED)
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ }
+ notify_partition_change(cs, old_prs);
+ return 0;
}
/*
* update_cpumasks_hier - Update effective cpumasks and tasks in the subtree
* @cs: the cpuset to consider
* @tmp: temp variables for calculating effective_cpus & partition setup
+ * @force: don't skip any descendant cpusets if set
*
- * When congifured cpumask is changed, the effective cpumasks of this cpuset
+ * When configured cpumask is changed, the effective cpumasks of this cpuset
* and all its descendants need to be updated.
*
- * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
+ * On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
-static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
+static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
+ bool force)
{
struct cpuset *cp;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
+ int old_prs, new_prs;
rcu_read_lock();
cpuset_for_each_descendant_pre(cp, pos_css, cs) {
struct cpuset *parent = parent_cs(cp);
+ bool update_parent = false;
compute_effective_cpumask(tmp->new_cpus, cp, parent);
/*
* If it becomes empty, inherit the effective mask of the
- * parent, which is guaranteed to have some CPUs.
+ * parent, which is guaranteed to have some CPUs unless
+ * it is a partition root that has explicitly distributed
+ * out all its CPUs.
*/
if (is_in_v2_mode() && cpumask_empty(tmp->new_cpus)) {
+ if (is_partition_valid(cp) &&
+ cpumask_equal(cp->cpus_allowed, cp->subparts_cpus))
+ goto update_parent_subparts;
+
cpumask_copy(tmp->new_cpus, parent->effective_cpus);
if (!cp->use_parent_ecpus) {
cp->use_parent_ecpus = true;
@@ -1319,56 +1573,40 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
/*
* Skip the whole subtree if the cpumask remains the same
- * and has no partition root state.
+ * and has no partition root state and force flag not set.
*/
- if (!cp->partition_root_state &&
+ if (!cp->partition_root_state && !force &&
cpumask_equal(tmp->new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
continue;
}
+update_parent_subparts:
/*
* update_parent_subparts_cpumask() should have been called
* for cs already in update_cpumask(). We should also call
* update_tasks_cpumask() again for tasks in the parent
* cpuset if the parent's subparts_cpus changes.
*/
- if ((cp != cs) && cp->partition_root_state) {
+ old_prs = new_prs = cp->partition_root_state;
+ if ((cp != cs) && old_prs) {
switch (parent->partition_root_state) {
- case PRS_DISABLED:
- /*
- * If parent is not a partition root or an
- * invalid partition root, clear the state
- * state and the CS_CPU_EXCLUSIVE flag.
- */
- WARN_ON_ONCE(cp->partition_root_state
- != PRS_ERROR);
- cp->partition_root_state = 0;
-
- /*
- * clear_bit() is an atomic operation and
- * readers aren't interested in the state
- * of CS_CPU_EXCLUSIVE anyway. So we can
- * just update the flag without holding
- * the callback_lock.
- */
- clear_bit(CS_CPU_EXCLUSIVE, &cp->flags);
- break;
-
- case PRS_ENABLED:
- if (update_parent_subparts_cpumask(cp, partcmd_update, NULL, tmp))
- update_tasks_cpumask(parent);
+ case PRS_ROOT:
+ case PRS_ISOLATED:
+ update_parent = true;
break;
- case PRS_ERROR:
+ default:
/*
- * When parent is invalid, it has to be too.
+ * When parent is not a partition root or is
+ * invalid, child partition roots become
+ * invalid too.
*/
- cp->partition_root_state = PRS_ERROR;
- if (cp->nr_subparts_cpus) {
- cp->nr_subparts_cpus = 0;
- cpumask_clear(cp->subparts_cpus);
- }
+ if (is_partition_valid(cp))
+ new_prs = -cp->partition_root_state;
+ WRITE_ONCE(cp->prs_err,
+ is_partition_invalid(parent)
+ ? PERR_INVPARENT : PERR_NOTPART);
break;
}
}
@@ -1377,39 +1615,45 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
continue;
rcu_read_unlock();
+ if (update_parent) {
+ update_parent_subparts_cpumask(cp, partcmd_update, NULL,
+ tmp);
+ /*
+ * The cpuset partition_root_state may become
+ * invalid. Capture it.
+ */
+ new_prs = cp->partition_root_state;
+ }
+
spin_lock_irq(&callback_lock);
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- if (cp->nr_subparts_cpus &&
- (cp->partition_root_state != PRS_ENABLED)) {
+ if (cp->nr_subparts_cpus && !is_partition_valid(cp)) {
+ /*
+ * Put all active subparts_cpus back to effective_cpus.
+ */
+ cpumask_or(tmp->new_cpus, tmp->new_cpus,
+ cp->subparts_cpus);
+ cpumask_and(tmp->new_cpus, tmp->new_cpus,
+ cpu_active_mask);
cp->nr_subparts_cpus = 0;
cpumask_clear(cp->subparts_cpus);
- } else if (cp->nr_subparts_cpus) {
+ }
+
+ cpumask_copy(cp->effective_cpus, tmp->new_cpus);
+ if (cp->nr_subparts_cpus) {
/*
* Make sure that effective_cpus & subparts_cpus
* are mutually exclusive.
- *
- * In the unlikely event that effective_cpus
- * becomes empty. we clear cp->nr_subparts_cpus and
- * let its child partition roots to compete for
- * CPUs again.
*/
cpumask_andnot(cp->effective_cpus, cp->effective_cpus,
cp->subparts_cpus);
- if (cpumask_empty(cp->effective_cpus)) {
- cpumask_copy(cp->effective_cpus, tmp->new_cpus);
- cpumask_clear(cp->subparts_cpus);
- cp->nr_subparts_cpus = 0;
- } else if (!cpumask_subset(cp->subparts_cpus,
- tmp->new_cpus)) {
- cpumask_andnot(cp->subparts_cpus,
- cp->subparts_cpus, tmp->new_cpus);
- cp->nr_subparts_cpus
- = cpumask_weight(cp->subparts_cpus);
- }
}
+
+ cp->partition_root_state = new_prs;
spin_unlock_irq(&callback_lock);
+ notify_partition_change(cp, old_prs);
+
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -1424,7 +1668,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp)
if (!cpumask_empty(cp->cpus_allowed) &&
is_sched_load_balance(cp) &&
(!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) ||
- is_partition_root(cp)))
+ is_partition_valid(cp)))
need_rebuild_sched_domains = true;
rcu_read_lock();
@@ -1448,10 +1692,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
/*
* Check all its siblings and call update_cpumasks_hier()
* if their use_parent_ecpus flag is set in order for them
* to use the right effective_cpus value.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1459,8 +1708,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
if (!sibling->use_parent_ecpus)
continue;
+ if (!css_tryget_online(&sibling->css))
+ continue;
- update_cpumasks_hier(sibling, tmp);
+ rcu_read_unlock();
+ update_cpumasks_hier(sibling, tmp, false);
+ rcu_read_lock();
+ css_put(&sibling->css);
}
rcu_read_unlock();
}
@@ -1476,6 +1730,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
{
int retval;
struct tmpmasks tmp;
+ bool invalidate = false;
/* top_cpuset.cpus_allowed tracks cpu_online_mask; it's read-only */
if (cs == &top_cpuset)
@@ -1503,10 +1758,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
return 0;
- retval = validate_change(cs, trialcs);
- if (retval < 0)
- return retval;
-
#ifdef CONFIG_CPUMASK_OFFSTACK
/*
* Use the cpumasks in trialcs for tmpmasks when they are pointers
@@ -1517,29 +1768,70 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
tmp.new_cpus = trialcs->cpus_allowed;
#endif
+ retval = validate_change(cs, trialcs);
+
+ if ((retval == -EINVAL) && cgroup_subsys_on_dfl(cpuset_cgrp_subsys)) {
+ struct cpuset *cp, *parent;
+ struct cgroup_subsys_state *css;
+
+ /*
+ * The -EINVAL error code indicates that partition sibling
+ * CPU exclusivity rule has been violated. We still allow
+ * the cpumask change to proceed while invalidating the
+ * partition. However, any conflicting sibling partitions
+ * have to be marked as invalid too.
+ */
+ invalidate = true;
+ rcu_read_lock();
+ parent = parent_cs(cs);
+ cpuset_for_each_child(cp, css, parent)
+ if (is_partition_valid(cp) &&
+ cpumask_intersects(trialcs->cpus_allowed, cp->cpus_allowed)) {
+ rcu_read_unlock();
+ update_parent_subparts_cpumask(cp, partcmd_invalidate, NULL, &tmp);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+ retval = 0;
+ }
+ if (retval < 0)
+ return retval;
+
if (cs->partition_root_state) {
- /* Cpumask of a partition root cannot be empty */
- if (cpumask_empty(trialcs->cpus_allowed))
- return -EINVAL;
- if (update_parent_subparts_cpumask(cs, partcmd_update,
- trialcs->cpus_allowed, &tmp) < 0)
- return -EINVAL;
+ if (invalidate)
+ update_parent_subparts_cpumask(cs, partcmd_invalidate,
+ NULL, &tmp);
+ else
+ update_parent_subparts_cpumask(cs, partcmd_update,
+ trialcs->cpus_allowed, &tmp);
}
+ compute_effective_cpumask(trialcs->effective_cpus, trialcs,
+ parent_cs(cs));
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
/*
- * Make sure that subparts_cpus is a subset of cpus_allowed.
+ * Make sure that subparts_cpus, if not empty, is a subset of
+ * cpus_allowed. Clear subparts_cpus if partition not valid or
+ * empty effective cpus with tasks.
*/
if (cs->nr_subparts_cpus) {
- cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
- cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ if (!is_partition_valid(cs) ||
+ (cpumask_subset(trialcs->effective_cpus, cs->subparts_cpus) &&
+ partition_is_populated(cs, NULL))) {
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ } else {
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus,
+ cs->cpus_allowed);
+ cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
+ }
}
spin_unlock_irq(&callback_lock);
- update_cpumasks_hier(cs, &tmp);
+ /* effective_cpus will be updated here */
+ update_cpumasks_hier(cs, &tmp, false);
if (cs->partition_root_state) {
struct cpuset *parent = parent_cs(cs);
@@ -1585,6 +1877,11 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
{
struct cpuset_migrate_mm_work *mwork;
+ if (nodes_equal(*from, *to)) {
+ mmput(mm);
+ return;
+ }
+
mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
if (mwork) {
mwork->mm = mm;
@@ -1637,12 +1934,12 @@ static void *cpuset_being_rebound;
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_mutex held,
+ * effective cpuset's. As this function is called with cpuset_rwsem held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_mutex */
+ static nodemask_t newmems; /* protected by cpuset_rwsem */
struct css_task_iter it;
struct task_struct *task;
@@ -1651,11 +1948,11 @@ static void update_tasks_nodemask(struct cpuset *cs)
guarantee_online_mems(cs, &newmems);
/*
- * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+ * The mpol_rebind_mm() call takes mmap_lock, which we couldn't
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_mutex, we know that no other rebind effort
+ * the global cpuset_rwsem, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -1699,9 +1996,9 @@ static void update_tasks_nodemask(struct cpuset *cs)
* When configured nodemask is changed, the effective nodemasks of this cpuset
* and all its descendants need to be updated.
*
- * On legacy hiearchy, effective_mems will be the same with mems_allowed.
+ * On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_mutex held
+ * Called with cpuset_rwsem held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -1754,9 +2051,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_mutex held. May take callback_lock during call.
+ * Call with cpuset_rwsem held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
*/
static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
@@ -1801,6 +2098,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
+ check_insane_mems_config(&trialcs->mems_allowed);
+
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
@@ -1844,7 +2143,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_mutex held, cpuset membership stays
+ * function is called with cpuset_rwsem held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
@@ -1864,7 +2163,7 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -1908,92 +2207,130 @@ out:
return err;
}
-/*
- * update_prstate - update partititon_root_state
- * cs: the cpuset to update
- * val: 0 - disabled, 1 - enabled
+/**
+ * update_prstate - update partition_root_state
+ * @cs: the cpuset to update
+ * @new_prs: new partition root state
+ * Return: 0 if successful, != 0 if error
*
- * Call with cpuset_mutex held.
+ * Call with cpuset_rwsem held.
*/
-static int update_prstate(struct cpuset *cs, int val)
+static int update_prstate(struct cpuset *cs, int new_prs)
{
- int err;
+ int err = PERR_NONE, old_prs = cs->partition_root_state;
+ bool sched_domain_rebuilt = false;
struct cpuset *parent = parent_cs(cs);
- struct tmpmasks tmp;
+ struct tmpmasks tmpmask;
- if ((val != 0) && (val != 1))
- return -EINVAL;
- if (val == cs->partition_root_state)
+ if (old_prs == new_prs)
return 0;
/*
- * Cannot force a partial or invalid partition root to a full
- * partition root.
+ * For a previously invalid partition root, leave it at being
+ * invalid if new_prs is not "member".
*/
- if (val && cs->partition_root_state)
- return -EINVAL;
+ if (new_prs && is_prs_invalid(old_prs)) {
+ cs->partition_root_state = -new_prs;
+ return 0;
+ }
- if (alloc_cpumasks(NULL, &tmp))
+ if (alloc_cpumasks(NULL, &tmpmask))
return -ENOMEM;
- err = -EINVAL;
- if (!cs->partition_root_state) {
+ if (!old_prs) {
/*
* Turning on partition root requires setting the
* CS_CPU_EXCLUSIVE bit implicitly as well and cpus_allowed
- * cannot be NULL.
+ * cannot be empty.
*/
- if (cpumask_empty(cs->cpus_allowed))
+ if (cpumask_empty(cs->cpus_allowed)) {
+ err = PERR_CPUSEMPTY;
goto out;
+ }
err = update_flag(CS_CPU_EXCLUSIVE, cs, 1);
- if (err)
+ if (err) {
+ err = PERR_NOTEXCL;
goto out;
+ }
err = update_parent_subparts_cpumask(cs, partcmd_enable,
- NULL, &tmp);
+ NULL, &tmpmask);
if (err) {
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
goto out;
}
- cs->partition_root_state = PRS_ENABLED;
+
+ if (new_prs == PRS_ISOLATED) {
+ /*
+ * Disable the load balance flag should not return an
+ * error unless the system is running out of memory.
+ */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 0);
+ sched_domain_rebuilt = true;
+ }
+ } else if (old_prs && new_prs) {
+ /*
+ * A change in load balance state only, no change in cpumasks.
+ */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, (new_prs != PRS_ISOLATED));
+ sched_domain_rebuilt = true;
+ goto out; /* Sched domain is rebuilt in update_flag() */
} else {
/*
- * Turning off partition root will clear the
- * CS_CPU_EXCLUSIVE bit.
+ * Switching back to member is always allowed even if it
+ * disables child partitions.
*/
- if (cs->partition_root_state == PRS_ERROR) {
- cs->partition_root_state = 0;
- update_flag(CS_CPU_EXCLUSIVE, cs, 0);
- err = 0;
- goto out;
- }
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL,
+ &tmpmask);
- err = update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, &tmp);
- if (err)
- goto out;
-
- cs->partition_root_state = 0;
+ /*
+ * If there are child partitions, they will all become invalid.
+ */
+ if (unlikely(cs->nr_subparts_cpus)) {
+ spin_lock_irq(&callback_lock);
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ compute_effective_cpumask(cs->effective_cpus, cs, parent);
+ spin_unlock_irq(&callback_lock);
+ }
/* Turning off CS_CPU_EXCLUSIVE will not return error */
update_flag(CS_CPU_EXCLUSIVE, cs, 0);
+
+ if (!is_sched_load_balance(cs)) {
+ /* Make sure load balance is on */
+ update_flag(CS_SCHED_LOAD_BALANCE, cs, 1);
+ sched_domain_rebuilt = true;
+ }
}
- /*
- * Update cpumask of parent's tasks except when it is the top
- * cpuset as some system daemons cannot be mapped to other CPUs.
- */
- if (parent != &top_cpuset)
- update_tasks_cpumask(parent);
+ update_tasks_cpumask(parent);
if (parent->child_ecpus_count)
- update_sibling_cpumasks(parent, cs, &tmp);
+ update_sibling_cpumasks(parent, cs, &tmpmask);
- rebuild_sched_domains_locked();
+ if (!sched_domain_rebuilt)
+ rebuild_sched_domains_locked();
out:
- free_cpumasks(NULL, &tmp);
- return err;
+ /*
+ * Make partition invalid if an error happen
+ */
+ if (err)
+ new_prs = -new_prs;
+ spin_lock_irq(&callback_lock);
+ cs->partition_root_state = new_prs;
+ spin_unlock_irq(&callback_lock);
+ /*
+ * Update child cpusets, if present.
+ * Force update if switching back to member.
+ */
+ if (!list_empty(&cs->css.children))
+ update_cpumasks_hier(cs, &tmpmask, !new_prs);
+
+ notify_partition_change(cs, old_prs);
+ free_cpumasks(NULL, &tmpmask);
+ return 0;
}
/*
@@ -2099,7 +2436,7 @@ static int fmeter_getrate(struct fmeter *fmp)
static struct cpuset *cpuset_attach_old_cs;
-/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
+/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
@@ -2119,8 +2456,14 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
(cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)))
goto out_unlock;
+ /*
+ * Task cannot be moved to a cpuset with empty effective cpus.
+ */
+ if (cpumask_empty(cs->effective_cpus))
+ goto out_unlock;
+
cgroup_taskset_for_each(task, css, tset) {
- ret = task_can_attach(task, cs->cpus_allowed);
+ ret = task_can_attach(task, cs->effective_cpus);
if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
@@ -2151,7 +2494,7 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
}
/*
- * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach()
+ * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
@@ -2159,7 +2502,7 @@ static cpumask_var_t cpus_attach;
static void cpuset_attach(struct cgroup_taskset *tset)
{
- /* static buf protected by cpuset_mutex */
+ /* static buf protected by cpuset_rwsem */
static nodemask_t cpuset_attach_nodemask_to;
struct task_struct *task;
struct task_struct *leader;
@@ -2170,17 +2513,16 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
percpu_down_write(&cpuset_rwsem);
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) {
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -2255,7 +2597,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -2293,7 +2635,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return retval;
}
@@ -2304,7 +2646,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = -ENODEV;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2319,7 +2661,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
}
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return retval;
}
@@ -2351,14 +2693,14 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
- * grabbing cpuset_mutex anyway. This only happens on the legacy
+ * grabbing cpuset_rwsem anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
kernfs_break_active_protection(of->kn);
flush_work(&cpuset_hotplug_work);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2384,7 +2726,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
flush_workqueue(cpuset_migrate_mm_wq);
@@ -2473,23 +2815,36 @@ static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft)
BUG();
}
- /* Unrechable but makes gcc happy */
+ /* Unreachable but makes gcc happy */
return 0;
}
static int sched_partition_show(struct seq_file *seq, void *v)
{
struct cpuset *cs = css_cs(seq_css(seq));
+ const char *err, *type = NULL;
switch (cs->partition_root_state) {
- case PRS_ENABLED:
+ case PRS_ROOT:
seq_puts(seq, "root\n");
break;
- case PRS_DISABLED:
+ case PRS_ISOLATED:
+ seq_puts(seq, "isolated\n");
+ break;
+ case PRS_MEMBER:
seq_puts(seq, "member\n");
break;
- case PRS_ERROR:
- seq_puts(seq, "root invalid\n");
+ case PRS_INVALID_ROOT:
+ type = "root";
+ fallthrough;
+ case PRS_INVALID_ISOLATED:
+ if (!type)
+ type = "isolated";
+ err = perr_strings[READ_ONCE(cs->prs_err)];
+ if (err)
+ seq_printf(seq, "%s invalid (%s)\n", type, err);
+ else
+ seq_printf(seq, "%s invalid\n", type);
break;
}
return 0;
@@ -2508,14 +2863,16 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
* Convert "root" to ENABLED, and convert "member" to DISABLED.
*/
if (!strcmp(buf, "root"))
- val = PRS_ENABLED;
+ val = PRS_ROOT;
else if (!strcmp(buf, "member"))
- val = PRS_DISABLED;
+ val = PRS_MEMBER;
+ else if (!strcmp(buf, "isolated"))
+ val = PRS_ISOLATED;
else
return -EINVAL;
css_get(&cs->css);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2523,7 +2880,7 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
retval = update_prstate(cs, val);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
css_put(&cs->css);
return retval ?: nbytes;
}
@@ -2675,6 +3032,7 @@ static struct cftype dfl_files[] = {
.write = sched_partition_write,
.private = FILE_PARTITION_ROOT,
.flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct cpuset, partition_file),
},
{
@@ -2710,12 +3068,16 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
return ERR_PTR(-ENOMEM);
}
- set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+ __set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
nodes_clear(cs->mems_allowed);
nodes_clear(cs->effective_mems);
fmeter_init(&cs->fmeter);
cs->relax_domain_level = -1;
+ /* Set CS_MEMORY_MIGRATE for default hierarchy */
+ if (cgroup_subsys_on_dfl(cpuset_cgrp_subsys))
+ __set_bit(CS_MEMORY_MIGRATE, &cs->flags);
+
return &cs->css;
}
@@ -2729,7 +3091,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
if (!parent)
return 0;
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
set_bit(CS_ONLINE, &cs->flags);
@@ -2755,7 +3117,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
/*
* Clone @parent's configuration if CGRP_CPUSET_CLONE_CHILDREN is
* set. This flag handling is implemented in cgroup core for
- * histrical reasons - the flag may be specified during mount.
+ * historical reasons - the flag may be specified during mount.
*
* Currently, if any sibling cpusets have exclusive cpus or mem, we
* refuse to clone the configuration - thereby refusing the task to
@@ -2782,7 +3144,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
spin_unlock_irq(&callback_lock);
out_unlock:
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
return 0;
}
@@ -2801,10 +3163,10 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
{
struct cpuset *cs = css_cs(css);
- get_online_cpus();
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
- if (is_partition_root(cs))
+ if (is_partition_valid(cs))
update_prstate(cs, 0);
if (!cgroup_subsys_on_dfl(cpuset_cgrp_subsys) &&
@@ -2822,7 +3184,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
clear_bit(CS_ONLINE, &cs->flags);
percpu_up_write(&cpuset_rwsem);
- put_online_cpus();
+ cpus_read_unlock();
}
static void cpuset_css_free(struct cgroup_subsys_state *css)
@@ -2952,7 +3314,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
/*
* Don't call update_tasks_cpumask() if the cpuset becomes empty,
- * as the tasks will be migratecd to an ancestor.
+ * as the tasks will be migrated to an ancestor.
*/
if (cpus_updated && !cpumask_empty(cs->cpus_allowed))
update_tasks_cpumask(cs);
@@ -2980,7 +3342,8 @@ hotplug_update_tasks(struct cpuset *cs,
struct cpumask *new_cpus, nodemask_t *new_mems,
bool cpus_updated, bool mems_updated)
{
- if (cpumask_empty(new_cpus))
+ /* A partition root is allowed to have empty effective cpus */
+ if (cpumask_empty(new_cpus) && !is_partition_valid(cs))
cpumask_copy(new_cpus, parent_cs(cs)->effective_cpus);
if (nodes_empty(*new_mems))
*new_mems = parent_cs(cs)->effective_mems;
@@ -3033,7 +3396,7 @@ retry:
goto retry;
}
- parent = parent_cs(cs);
+ parent = parent_cs(cs);
compute_effective_cpumask(&new_cpus, cs, parent);
nodes_and(new_mems, cs->mems_allowed, parent->effective_mems);
@@ -3049,47 +3412,73 @@ retry:
/*
* In the unlikely event that a partition root has empty
- * effective_cpus or its parent becomes erroneous, we have to
- * transition it to the erroneous state.
+ * effective_cpus with tasks, we will have to invalidate child
+ * partitions, if present, by setting nr_subparts_cpus to 0 to
+ * reclaim their cpus.
*/
- if (is_partition_root(cs) && (cpumask_empty(&new_cpus) ||
- (parent->partition_root_state == PRS_ERROR))) {
+ if (cs->nr_subparts_cpus && is_partition_valid(cs) &&
+ cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)) {
+ spin_lock_irq(&callback_lock);
+ cs->nr_subparts_cpus = 0;
+ cpumask_clear(cs->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
+ compute_effective_cpumask(&new_cpus, cs, parent);
+ }
+
+ /*
+ * Force the partition to become invalid if either one of
+ * the following conditions hold:
+ * 1) empty effective cpus but not valid empty partition.
+ * 2) parent is invalid or doesn't grant any cpus to child
+ * partitions.
+ */
+ if (is_partition_valid(cs) && (!parent->nr_subparts_cpus ||
+ (cpumask_empty(&new_cpus) && partition_is_populated(cs, NULL)))) {
+ int old_prs, parent_prs;
+
+ update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp);
if (cs->nr_subparts_cpus) {
+ spin_lock_irq(&callback_lock);
cs->nr_subparts_cpus = 0;
cpumask_clear(cs->subparts_cpus);
+ spin_unlock_irq(&callback_lock);
compute_effective_cpumask(&new_cpus, cs, parent);
}
- /*
- * If the effective_cpus is empty because the child
- * partitions take away all the CPUs, we can keep
- * the current partition and let the child partitions
- * fight for available CPUs.
- */
- if ((parent->partition_root_state == PRS_ERROR) ||
- cpumask_empty(&new_cpus)) {
- update_parent_subparts_cpumask(cs, partcmd_disable,
- NULL, tmp);
- cs->partition_root_state = PRS_ERROR;
+ old_prs = cs->partition_root_state;
+ parent_prs = parent->partition_root_state;
+ if (is_partition_valid(cs)) {
+ spin_lock_irq(&callback_lock);
+ make_partition_invalid(cs);
+ spin_unlock_irq(&callback_lock);
+ if (is_prs_invalid(parent_prs))
+ WRITE_ONCE(cs->prs_err, PERR_INVPARENT);
+ else if (!parent_prs)
+ WRITE_ONCE(cs->prs_err, PERR_NOTPART);
+ else
+ WRITE_ONCE(cs->prs_err, PERR_HOTPLUG);
+ notify_partition_change(cs, old_prs);
}
cpuset_force_rebuild();
}
/*
- * On the other hand, an erroneous partition root may be transitioned
- * back to a regular one or a partition root with no CPU allocated
- * from the parent may change to erroneous.
+ * On the other hand, an invalid partition root may be transitioned
+ * back to a regular one.
*/
- if (is_partition_root(parent) &&
- ((cs->partition_root_state == PRS_ERROR) ||
- !cpumask_intersects(&new_cpus, parent->subparts_cpus)) &&
- update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp))
- cpuset_force_rebuild();
+ else if (is_partition_valid(parent) && is_partition_invalid(cs)) {
+ update_parent_subparts_cpumask(cs, partcmd_update, NULL, tmp);
+ if (is_partition_valid(cs))
+ cpuset_force_rebuild();
+ }
update_tasks:
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
+ if (mems_updated)
+ check_insane_mems_config(&new_mems);
+
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
@@ -3141,6 +3530,13 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
cpus_updated = !cpumask_equal(top_cpuset.effective_cpus, &new_cpus);
mems_updated = !nodes_equal(top_cpuset.effective_mems, new_mems);
+ /*
+ * In the rare case that hotplug removes all the cpus in subparts_cpus,
+ * we assumed that cpus are updated.
+ */
+ if (!cpus_updated && top_cpuset.nr_subparts_cpus)
+ cpus_updated = true;
+
/* synchronize cpus_allowed to cpu_active_mask */
if (cpus_updated) {
spin_lock_irq(&callback_lock);
@@ -3246,8 +3642,11 @@ static struct notifier_block cpuset_track_online_nodes_nb = {
*/
void __init cpuset_init_smp(void)
{
- cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
- top_cpuset.mems_allowed = node_states[N_MEMORY];
+ /*
+ * cpus_allowd/mems_allowed set to v2 values in the initial
+ * cpuset_bind() call will be reset to v1 values in another
+ * cpuset_bind() call when v1 cpuset is mounted.
+ */
top_cpuset.old_mems_allowed = top_cpuset.mems_allowed;
cpumask_copy(top_cpuset.effective_cpus, cpu_active_mask);
@@ -3275,9 +3674,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
- guarantee_online_cpus(task_cs(tsk), pmask);
- rcu_read_unlock();
+ guarantee_online_cpus(tsk, pmask);
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3291,13 +3688,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
+ *
+ * Returns true if the affinity of @tsk was changed, false otherwise.
**/
-void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ const struct cpumask *cs_mask;
+ bool changed = false;
+
rcu_read_lock();
- do_set_cpus_allowed(tsk, is_in_v2_mode() ?
- task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+ cs_mask = task_cs(tsk)->cpus_allowed;
+ if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
+ do_set_cpus_allowed(tsk, cs_mask);
+ changed = true;
+ }
rcu_read_unlock();
/*
@@ -3317,6 +3723,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* select_fallback_rq() will fix things ups and set cpu_possible_mask
* if required.
*/
+ return changed;
}
void __init cpuset_init_current_mems_allowed(void)
@@ -3349,7 +3756,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
}
/**
- * cpuset_nodemask_valid_mems_allowed - check nodemask vs. curremt mems_allowed
+ * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed
* @nodemask: the nodemask to be checked
*
* Are any of the nodes in the nodemask allowed in current->mems_allowed?
@@ -3372,8 +3779,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
return cs;
}
-/**
- * cpuset_node_allowed - Can we allocate on a memory node?
+/*
+ * __cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@@ -3415,7 +3822,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
- int allowed; /* is allocation in zone z allowed? */
+ bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
@@ -3544,8 +3951,8 @@ void cpuset_print_current_mems_allowed(void)
int cpuset_memory_pressure_enabled __read_mostly;
-/**
- * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+/*
+ * __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
@@ -3560,7 +3967,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
- **/
+ */
void __cpuset_memory_pressure_bump(void)
{
@@ -3576,7 +3983,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_mutex, keeping cpuset_attach() from changing it
+ * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c
index 3984dd6b8ddb..617861a54793 100644
--- a/kernel/cgroup/freezer.c
+++ b/kernel/cgroup/freezer.c
@@ -1,4 +1,4 @@
-//SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0
#include <linux/cgroup.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c
index 08236798d173..1b6b21851e9d 100644
--- a/kernel/cgroup/legacy_freezer.c
+++ b/kernel/cgroup/legacy_freezer.c
@@ -113,7 +113,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
if (parent && (parent->state & CGROUP_FREEZING)) {
freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
}
mutex_unlock(&freezer_mutex);
@@ -134,7 +134,7 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
- atomic_dec(&system_freezing_cnt);
+ static_branch_dec(&freezer_active);
freezer->state = 0;
@@ -179,6 +179,7 @@ static void freezer_attach(struct cgroup_taskset *tset)
__thaw_task(task);
} else {
freeze_task(task);
+
/* clear FROZEN and propagate upwards */
while (freezer && (freezer->state & CGROUP_FROZEN)) {
freezer->state &= ~CGROUP_FROZEN;
@@ -271,16 +272,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
css_task_iter_start(css, 0, &it);
while ((task = css_task_iter_next(&it))) {
- if (freezing(task)) {
- /*
- * freezer_should_skip() indicates that the task
- * should be skipped when determining freezing
- * completion. Consider it frozen in addition to
- * the usual frozen condition.
- */
- if (!frozen(task) && !freezer_should_skip(task))
- goto out_iter_end;
- }
+ if (freezing(task) && !frozen(task))
+ goto out_iter_end;
}
freezer->state |= CGROUP_FROZEN;
@@ -357,7 +350,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
if (freeze) {
if (!(freezer->state & CGROUP_FREEZING))
- atomic_inc(&system_freezing_cnt);
+ static_branch_inc(&freezer_active);
freezer->state |= state;
freeze_cgroup(freezer);
} else {
@@ -366,9 +359,9 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
freezer->state &= ~state;
if (!(freezer->state & CGROUP_FREEZING)) {
- if (was_freezing)
- atomic_dec(&system_freezing_cnt);
freezer->state &= ~CGROUP_FROZEN;
+ if (was_freezing)
+ static_branch_dec(&freezer_active);
unfreeze_cgroup(freezer);
}
}
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
new file mode 100644
index 000000000000..fe3e8a0eb7ed
--- /dev/null
+++ b/kernel/cgroup/misc.c
@@ -0,0 +1,424 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Miscellaneous cgroup controller
+ *
+ * Copyright 2020 Google LLC
+ * Author: Vipin Sharma <vipinsh@google.com>
+ */
+
+#include <linux/limits.h>
+#include <linux/cgroup.h>
+#include <linux/errno.h>
+#include <linux/atomic.h>
+#include <linux/slab.h>
+#include <linux/misc_cgroup.h>
+
+#define MAX_STR "max"
+#define MAX_NUM ULONG_MAX
+
+/* Miscellaneous res name, keep it in sync with enum misc_res_type */
+static const char *const misc_res_name[] = {
+#ifdef CONFIG_KVM_AMD_SEV
+ /* AMD SEV ASIDs resource */
+ "sev",
+ /* AMD SEV-ES ASIDs resource */
+ "sev_es",
+#endif
+};
+
+/* Root misc cgroup */
+static struct misc_cg root_cg;
+
+/*
+ * Miscellaneous resources capacity for the entire machine. 0 capacity means
+ * resource is not initialized or not present in the host.
+ *
+ * root_cg.max and capacity are independent of each other. root_cg.max can be
+ * more than the actual capacity. We are using Limits resource distribution
+ * model of cgroup for miscellaneous controller.
+ */
+static unsigned long misc_res_capacity[MISC_CG_RES_TYPES];
+
+/**
+ * parent_misc() - Get the parent of the passed misc cgroup.
+ * @cgroup: cgroup whose parent needs to be fetched.
+ *
+ * Context: Any context.
+ * Return:
+ * * struct misc_cg* - Parent of the @cgroup.
+ * * %NULL - If @cgroup is null or the passed cgroup does not have a parent.
+ */
+static struct misc_cg *parent_misc(struct misc_cg *cgroup)
+{
+ return cgroup ? css_misc(cgroup->css.parent) : NULL;
+}
+
+/**
+ * valid_type() - Check if @type is valid or not.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return:
+ * * true - If valid type.
+ * * false - If not valid type.
+ */
+static inline bool valid_type(enum misc_res_type type)
+{
+ return type >= 0 && type < MISC_CG_RES_TYPES;
+}
+
+/**
+ * misc_cg_res_total_usage() - Get the current total usage of the resource.
+ * @type: misc res type.
+ *
+ * Context: Any context.
+ * Return: Current total usage of the resource.
+ */
+unsigned long misc_cg_res_total_usage(enum misc_res_type type)
+{
+ if (valid_type(type))
+ return atomic_long_read(&root_cg.res[type].usage);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_res_total_usage);
+
+/**
+ * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
+ * @type: Type of the misc res.
+ * @capacity: Supported capacity of the misc res on the host.
+ *
+ * If capacity is 0 then the charging a misc cgroup fails for that type.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - Successfully registered the capacity.
+ * * %-EINVAL - If @type is invalid.
+ */
+int misc_cg_set_capacity(enum misc_res_type type, unsigned long capacity)
+{
+ if (!valid_type(type))
+ return -EINVAL;
+
+ WRITE_ONCE(misc_res_capacity[type], capacity);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(misc_cg_set_capacity);
+
+/**
+ * misc_cg_cancel_charge() - Cancel the charge from the misc cgroup.
+ * @type: Misc res type in misc cg to cancel the charge from.
+ * @cg: Misc cgroup to cancel charge from.
+ * @amount: Amount to cancel.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_cancel_charge(enum misc_res_type type, struct misc_cg *cg,
+ unsigned long amount)
+{
+ WARN_ONCE(atomic_long_add_negative(-amount, &cg->res[type].usage),
+ "misc cgroup resource %s became less than 0",
+ misc_res_name[type]);
+}
+
+/**
+ * misc_cg_try_charge() - Try charging the misc cgroup.
+ * @type: Misc res type to charge.
+ * @cg: Misc cgroup which will be charged.
+ * @amount: Amount to charge.
+ *
+ * Charge @amount to the misc cgroup. Caller must use the same cgroup during
+ * the uncharge call.
+ *
+ * Context: Any context.
+ * Return:
+ * * %0 - If successfully charged.
+ * * -EINVAL - If @type is invalid or misc res has 0 capacity.
+ * * -EBUSY - If max limit will be crossed or total usage will be more than the
+ * capacity.
+ */
+int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
+ unsigned long amount)
+{
+ struct misc_cg *i, *j;
+ int ret;
+ struct misc_res *res;
+ int new_usage;
+
+ if (!(valid_type(type) && cg && READ_ONCE(misc_res_capacity[type])))
+ return -EINVAL;
+
+ if (!amount)
+ return 0;
+
+ for (i = cg; i; i = parent_misc(i)) {
+ res = &i->res[type];
+
+ new_usage = atomic_long_add_return(amount, &res->usage);
+ if (new_usage > READ_ONCE(res->max) ||
+ new_usage > READ_ONCE(misc_res_capacity[type])) {
+ ret = -EBUSY;
+ goto err_charge;
+ }
+ }
+ return 0;
+
+err_charge:
+ for (j = i; j; j = parent_misc(j)) {
+ atomic_long_inc(&j->res[type].events);
+ cgroup_file_notify(&j->events_file);
+ }
+
+ for (j = cg; j != i; j = parent_misc(j))
+ misc_cg_cancel_charge(type, j, amount);
+ misc_cg_cancel_charge(type, i, amount);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(misc_cg_try_charge);
+
+/**
+ * misc_cg_uncharge() - Uncharge the misc cgroup.
+ * @type: Misc res type which was charged.
+ * @cg: Misc cgroup which will be uncharged.
+ * @amount: Charged amount.
+ *
+ * Context: Any context.
+ */
+void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg,
+ unsigned long amount)
+{
+ struct misc_cg *i;
+
+ if (!(amount && valid_type(type) && cg))
+ return;
+
+ for (i = cg; i; i = parent_misc(i))
+ misc_cg_cancel_charge(type, i, amount);
+}
+EXPORT_SYMBOL_GPL(misc_cg_uncharge);
+
+/**
+ * misc_cg_max_show() - Show the misc cgroup max limit.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_max_show(struct seq_file *sf, void *v)
+{
+ int i;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ unsigned long max;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (READ_ONCE(misc_res_capacity[i])) {
+ max = READ_ONCE(cg->res[i].max);
+ if (max == MAX_NUM)
+ seq_printf(sf, "%s max\n", misc_res_name[i]);
+ else
+ seq_printf(sf, "%s %lu\n", misc_res_name[i],
+ max);
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_max_write() - Update the maximum limit of the cgroup.
+ * @of: Handler for the file.
+ * @buf: Data from the user. It should be either "max", 0, or a positive
+ * integer.
+ * @nbytes: Number of bytes of the data.
+ * @off: Offset in the file.
+ *
+ * User can pass data like:
+ * echo sev 23 > misc.max, OR
+ * echo sev max > misc.max
+ *
+ * Context: Any context.
+ * Return:
+ * * >= 0 - Number of bytes processed in the input.
+ * * -EINVAL - If buf is not valid.
+ * * -ERANGE - If number is bigger than the unsigned long capacity.
+ */
+static ssize_t misc_cg_max_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct misc_cg *cg;
+ unsigned long max;
+ int ret = 0, i;
+ enum misc_res_type type = MISC_CG_RES_TYPES;
+ char *token;
+
+ buf = strstrip(buf);
+ token = strsep(&buf, " ");
+
+ if (!token || !buf)
+ return -EINVAL;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ if (!strcmp(misc_res_name[i], token)) {
+ type = i;
+ break;
+ }
+ }
+
+ if (type == MISC_CG_RES_TYPES)
+ return -EINVAL;
+
+ if (!strcmp(MAX_STR, buf)) {
+ max = MAX_NUM;
+ } else {
+ ret = kstrtoul(buf, 0, &max);
+ if (ret)
+ return ret;
+ }
+
+ cg = css_misc(of_css(of));
+
+ if (READ_ONCE(misc_res_capacity[type]))
+ WRITE_ONCE(cg->res[type].max, max);
+ else
+ ret = -EINVAL;
+
+ return ret ? ret : nbytes;
+}
+
+/**
+ * misc_cg_current_show() - Show the current usage of the misc cgroup.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_current_show(struct seq_file *sf, void *v)
+{
+ int i;
+ unsigned long usage;
+ struct misc_cg *cg = css_misc(seq_css(sf));
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ usage = atomic_long_read(&cg->res[i].usage);
+ if (READ_ONCE(misc_res_capacity[i]) || usage)
+ seq_printf(sf, "%s %lu\n", misc_res_name[i], usage);
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_capacity_show() - Show the total capacity of misc res on the host.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Only present in the root cgroup directory.
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_capacity_show(struct seq_file *sf, void *v)
+{
+ int i;
+ unsigned long cap;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ cap = READ_ONCE(misc_res_capacity[i]);
+ if (cap)
+ seq_printf(sf, "%s %lu\n", misc_res_name[i], cap);
+ }
+
+ return 0;
+}
+
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ unsigned long events, i;
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ events = atomic_long_read(&cg->res[i].events);
+ if (READ_ONCE(misc_res_capacity[i]) || events)
+ seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+ }
+ return 0;
+}
+
+/* Misc cgroup interface files */
+static struct cftype misc_cg_files[] = {
+ {
+ .name = "max",
+ .write = misc_cg_max_write,
+ .seq_show = misc_cg_max_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "current",
+ .seq_show = misc_cg_current_show,
+ .flags = CFTYPE_NOT_ON_ROOT,
+ },
+ {
+ .name = "capacity",
+ .seq_show = misc_cg_capacity_show,
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ },
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct misc_cg, events_file),
+ .seq_show = misc_events_show,
+ },
+ {}
+};
+
+/**
+ * misc_cg_alloc() - Allocate misc cgroup.
+ * @parent_css: Parent cgroup.
+ *
+ * Context: Process context.
+ * Return:
+ * * struct cgroup_subsys_state* - css of the allocated cgroup.
+ * * ERR_PTR(-ENOMEM) - No memory available to allocate.
+ */
+static struct cgroup_subsys_state *
+misc_cg_alloc(struct cgroup_subsys_state *parent_css)
+{
+ enum misc_res_type i;
+ struct misc_cg *cg;
+
+ if (!parent_css) {
+ cg = &root_cg;
+ } else {
+ cg = kzalloc(sizeof(*cg), GFP_KERNEL);
+ if (!cg)
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+ WRITE_ONCE(cg->res[i].max, MAX_NUM);
+ atomic_long_set(&cg->res[i].usage, 0);
+ }
+
+ return &cg->css;
+}
+
+/**
+ * misc_cg_free() - Free the misc cgroup.
+ * @css: cgroup subsys object.
+ *
+ * Context: Any context.
+ */
+static void misc_cg_free(struct cgroup_subsys_state *css)
+{
+ kfree(css_misc(css));
+}
+
+/* Cgroup controller callbacks */
+struct cgroup_subsys misc_cgrp_subsys = {
+ .css_alloc = misc_cg_alloc,
+ .css_free = misc_cg_free,
+ .legacy_cftypes = misc_cg_files,
+ .dfl_cftypes = misc_cg_files,
+};
diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c
index b05f1dd58a62..0d5c29879a50 100644
--- a/kernel/cgroup/namespace.c
+++ b/kernel/cgroup/namespace.c
@@ -24,7 +24,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
struct cgroup_namespace *new_ns;
int ret;
- new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL);
+ new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL_ACCOUNT);
if (!new_ns)
return ERR_PTR(-ENOMEM);
ret = ns_alloc_inum(&new_ns->ns);
@@ -32,7 +32,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void)
kfree(new_ns);
return ERR_PTR(ret);
}
- refcount_set(&new_ns->count, 1);
+ refcount_set(&new_ns->ns.count, 1);
new_ns->ns.ops = &cgroupns_operations;
return new_ns;
}
@@ -95,11 +95,12 @@ static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns)
return container_of(ns, struct cgroup_namespace, ns);
}
-static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns)
+static int cgroupns_install(struct nsset *nsset, struct ns_common *ns)
{
+ struct nsproxy *nsproxy = nsset->nsproxy;
struct cgroup_namespace *cgroup_ns = to_cg_ns(ns);
- if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) ||
+ if (!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 138059eb730d..7695e60bcb40 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -33,6 +33,7 @@
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
+#include <linux/sched/task.h>
#define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
#define PIDS_MAX_STR "max"
@@ -46,6 +47,7 @@ struct pids_cgroup {
*/
atomic64_t counter;
atomic64_t limit;
+ int64_t watermark;
/* Handle for "pids.events" */
struct cgroup_file events_file;
@@ -84,6 +86,16 @@ static void pids_css_free(struct cgroup_subsys_state *css)
kfree(css_pids(css));
}
+static void pids_update_watermark(struct pids_cgroup *p, int64_t nr_pids)
+{
+ /*
+ * This is racy, but we don't need perfectly accurate tallying of
+ * the watermark, and this lets us avoid extra atomic overhead.
+ */
+ if (nr_pids > READ_ONCE(p->watermark))
+ WRITE_ONCE(p->watermark, nr_pids);
+}
+
/**
* pids_cancel - uncharge the local pid count
* @pids: the pid cgroup state
@@ -127,8 +139,11 @@ static void pids_charge(struct pids_cgroup *pids, int num)
{
struct pids_cgroup *p;
- for (p = pids; parent_pids(p); p = parent_pids(p))
- atomic64_add(num, &p->counter);
+ for (p = pids; parent_pids(p); p = parent_pids(p)) {
+ int64_t new = atomic64_add_return(num, &p->counter);
+
+ pids_update_watermark(p, new);
+ }
}
/**
@@ -155,6 +170,12 @@ static int pids_try_charge(struct pids_cgroup *pids, int num)
*/
if (new > limit)
goto revert;
+
+ /*
+ * Not technically accurate if we go over limit somewhere up
+ * the hierarchy, but that's tolerable for the watermark.
+ */
+ pids_update_watermark(p, new);
}
return 0;
@@ -214,13 +235,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
* task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
* on cgroup_threadgroup_change_begin() held by the copy_process().
*/
-static int pids_can_fork(struct task_struct *task)
+static int pids_can_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
int err;
- css = task_css_check(current, pids_cgrp_id, true);
+ if (cset)
+ css = cset->subsys[pids_cgrp_id];
+ else
+ css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
err = pids_try_charge(pids, 1);
if (err) {
@@ -235,12 +259,15 @@ static int pids_can_fork(struct task_struct *task)
return err;
}
-static void pids_cancel_fork(struct task_struct *task)
+static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
{
struct cgroup_subsys_state *css;
struct pids_cgroup *pids;
- css = task_css_check(current, pids_cgrp_id, true);
+ if (cset)
+ css = cset->subsys[pids_cgrp_id];
+ else
+ css = task_css_check(current, pids_cgrp_id, true);
pids = css_pids(css);
pids_uncharge(pids, 1);
}
@@ -304,6 +331,14 @@ static s64 pids_current_read(struct cgroup_subsys_state *css,
return atomic64_read(&pids->counter);
}
+static s64 pids_peak_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ struct pids_cgroup *pids = css_pids(css);
+
+ return READ_ONCE(pids->watermark);
+}
+
static int pids_events_show(struct seq_file *sf, void *v)
{
struct pids_cgroup *pids = css_pids(seq_css(sf));
@@ -325,6 +360,11 @@ static struct cftype pids_files[] = {
.flags = CFTYPE_NOT_ON_ROOT,
},
{
+ .name = "peak",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = pids_peak_read,
+ },
+ {
.name = "events",
.seq_show = pids_events_show,
.file_offset = offsetof(struct pids_cgroup, events_file),
diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c
index ae042c347c64..3135406608c7 100644
--- a/kernel/cgroup/rdma.c
+++ b/kernel/cgroup/rdma.c
@@ -244,7 +244,7 @@ EXPORT_SYMBOL(rdmacg_uncharge);
* This function follows charging resource in hierarchical way.
* It will fail if the charge would cause the new value to exceed the
* hierarchical limit.
- * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
+ * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
* Returns pointer to rdmacg for this resource when charging is successful.
*
* Charger needs to account resources on two criteria.
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 6f87352f8219..793ecff29038 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -25,34 +29,26 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu)
void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
{
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu);
- struct cgroup *parent;
unsigned long flags;
- /* nothing to do for root */
- if (!cgroup_parent(cgrp))
- return;
-
- /*
- * Paired with the one in cgroup_rstat_cpu_pop_updated(). Either we
- * see NULL updated_next or they see our updated stat.
- */
- smp_mb();
-
/*
+ * Speculative already-on-list test. This may race leading to
+ * temporary inaccuracies, which is fine.
+ *
* Because @parent's updated_children is terminated with @parent
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
- if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
/* put @cgrp and all ancestors on the corresponding updated lists */
- for (parent = cgroup_parent(cgrp); parent;
- cgrp = parent, parent = cgroup_parent(cgrp)) {
+ while (true) {
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_rstat_cpu *prstatc;
/*
* Both additions and removals are bottom-up. If a cgroup
@@ -61,13 +57,21 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
if (rstatc->updated_next)
break;
+ /* Root has no parent to link it to, but mark it busy */
+ if (!parent) {
+ rstatc->updated_next = cgrp;
+ break;
+ }
+
+ prstatc = cgroup_rstat_cpu(parent, cpu);
rstatc->updated_next = prstatc->updated_children;
prstatc->updated_children = cgrp;
+
+ cgrp = parent;
}
raw_spin_unlock_irqrestore(cpu_lock, flags);
}
-EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
/**
* cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree
@@ -75,7 +79,7 @@ EXPORT_SYMBOL_GPL(cgroup_rstat_updated);
* @root: root of the tree to traversal
* @cpu: target cpu
*
- * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts
+ * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts
* the traversal and %NULL return indicates the end. During traversal,
* each returned cgroup is unlinked from the tree. Must be called with the
* matching cgroup_rstat_cpu_lock held.
@@ -88,6 +92,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
+ struct cgroup *parent;
if (pos == root)
return NULL;
@@ -96,10 +101,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick whatever unvisited node as the starting point.
*/
- if (!pos)
+ if (!pos) {
pos = root;
- else
+ /* return NULL if this subtree is not on-list */
+ if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
+ return NULL;
+ } else {
pos = cgroup_parent(pos);
+ }
/* walk down to the first leaf */
while (true) {
@@ -115,39 +124,52 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- if (rstatc->updated_next) {
- struct cgroup *parent = cgroup_parent(pos);
- struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu);
- struct cgroup_rstat_cpu *nrstatc;
+ parent = cgroup_parent(pos);
+ if (parent) {
+ struct cgroup_rstat_cpu *prstatc;
struct cgroup **nextp;
+ prstatc = cgroup_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
- while (true) {
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
- if (*nextp == pos)
- break;
+ while (*nextp != pos) {
+ struct cgroup_rstat_cpu *nrstatc;
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
WARN_ON_ONCE(*nextp == parent);
nextp = &nrstatc->updated_next;
}
-
*nextp = rstatc->updated_next;
- rstatc->updated_next = NULL;
+ }
- /*
- * Paired with the one in cgroup_rstat_cpu_updated().
- * Either they see NULL updated_next or we see their
- * updated stat.
- */
- smp_mb();
+ rstatc->updated_next = NULL;
+ return pos;
+}
- return pos;
- }
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ *
+ * __diag_* below are needed to dismiss the missing prototype warning.
+ */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
- /* only happens for @root */
- return NULL;
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
}
+__diag_pop();
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -160,12 +182,22 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -173,7 +205,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
@@ -224,7 +256,7 @@ void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp)
}
/**
- * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold
+ * cgroup_rstat_flush_hold - flush stats in @cgrp's subtree and hold
* @cgrp: target cgroup
*
* Flush stats in @cgrp's subtree and prevent further flushes. Must be
@@ -296,8 +328,6 @@ void __init cgroup_rstat_boot(void)
for_each_possible_cpu(cpu)
raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu));
-
- BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp));
}
/*
@@ -310,6 +340,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -318,29 +351,35 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
- struct cgroup *parent = cgroup_parent(cgrp);
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
- struct cgroup_base_stat cur, delta;
+ struct cgroup *parent = cgroup_parent(cgrp);
+ struct cgroup_base_stat delta;
unsigned seq;
+ /* Root-level stats are sourced from system-wide CPU stats */
+ if (!parent)
+ return;
+
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
+ delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
- delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
- /* propagate global delta to parent */
- if (parent) {
+ /* propagate global delta to parent (unless that's root) */
+ if (cgroup_parent(parent)) {
delta = cgrp->bstat;
cgroup_base_stat_sub(&delta, &cgrp->last_bstat);
cgroup_base_stat_add(&parent->bstat, &delta);
@@ -349,19 +388,20 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
}
static struct cgroup_rstat_cpu *
-cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp)
+cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp, unsigned long *flags)
{
struct cgroup_rstat_cpu *rstatc;
rstatc = get_cpu_ptr(cgrp->rstat_cpu);
- u64_stats_update_begin(&rstatc->bsync);
+ *flags = u64_stats_update_begin_irqsave(&rstatc->bsync);
return rstatc;
}
static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
- struct cgroup_rstat_cpu *rstatc)
+ struct cgroup_rstat_cpu *rstatc,
+ unsigned long flags)
{
- u64_stats_update_end(&rstatc->bsync);
+ u64_stats_update_end_irqrestore(&rstatc->bsync, flags);
cgroup_rstat_updated(cgrp, smp_processor_id());
put_cpu_ptr(rstatc);
}
@@ -369,18 +409,20 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp,
void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
rstatc->bstat.cputime.sum_exec_runtime += delta_exec;
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
}
void __cgroup_account_cputime_field(struct cgroup *cgrp,
enum cpu_usage_stat index, u64 delta_exec)
{
struct cgroup_rstat_cpu *rstatc;
+ unsigned long flags;
- rstatc = cgroup_base_stat_cputime_account_begin(cgrp);
+ rstatc = cgroup_base_stat_cputime_account_begin(cgrp, &flags);
switch (index) {
case CPUTIME_USER:
@@ -392,32 +434,118 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatc->bstat.forceidle_sum += delta_exec;
+ break;
+#endif
default:
break;
}
- cgroup_base_stat_cputime_account_end(cgrp, rstatc);
+ cgroup_base_stat_cputime_account_end(cgrp, rstatc, flags);
+}
+
+/*
+ * compute the cputime for the root cgroup by getting the per cpu data
+ * at a global level, then categorizing the fields in a manner consistent
+ * with how it is done by __cgroup_account_cputime_field for each bit of
+ * cpu time attributed to a cgroup.
+ */
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
+{
+ struct task_cputime *cputime = &bstat->cputime;
+ int i;
+
+ cputime->stime = 0;
+ cputime->utime = 0;
+ cputime->sum_exec_runtime = 0;
+ for_each_possible_cpu(i) {
+ struct kernel_cpustat kcpustat;
+ u64 *cpustat = kcpustat.cpustat;
+ u64 user = 0;
+ u64 sys = 0;
+
+ kcpustat_cpu_fetch(&kcpustat, i);
+
+ user += cpustat[CPUTIME_USER];
+ user += cpustat[CPUTIME_NICE];
+ cputime->utime += user;
+
+ sys += cpustat[CPUTIME_SYSTEM];
+ sys += cpustat[CPUTIME_IRQ];
+ sys += cpustat[CPUTIME_SOFTIRQ];
+ cputime->stime += sys;
+
+ cputime->sum_exec_runtime += user;
+ cputime->sum_exec_runtime += sys;
+ cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
+ }
}
void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
-
- if (!cgroup_parent(cgrp))
- return;
-
- cgroup_rstat_flush_hold(cgrp);
- usage = cgrp->bstat.cputime.sum_exec_runtime;
- cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime);
- cgroup_rstat_flush_release();
+ struct cgroup_base_stat bstat;
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time;
+#endif
+
+ if (cgroup_parent(cgrp)) {
+ cgroup_rstat_flush_hold(cgrp);
+ usage = cgrp->bstat.cputime.sum_exec_runtime;
+ cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
+ &utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
+ cgroup_rstat_flush_release();
+ } else {
+ root_cgroup_cputime(&bstat);
+ usage = bstat.cputime.sum_exec_runtime;
+ utime = bstat.cputime.utime;
+ stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = bstat.forceidle_sum;
+#endif
+ }
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+ do_div(forceidle_time, NSEC_PER_USEC);
+#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
}
+late_initcall(bpf_rstat_kfunc_init);