author		Linus Torvalds <torvalds@linux-foundation.org>	2025-05-27 20:59:53 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2025-05-27 20:59:53 -0700
commit		3b66e6b3c098646f75c54e269dd963e2281c555c (patch)
tree		79568dd6b16cac32b86ff646ddaaafd17dbb826e /include
parent		Merge tag 'wq-for-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/wq (diff)
parent		sched_ext: Convert cgroup BPF support to use cgroup_lifetime_notifier (diff)
Merge tag 'cgroup-for-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - cgroup rstat shared the tracking tree across all controllers, the
   rationale being that a cgroup using one resource is likely to be
   using other resources at the same time (i.e. if something is
   allocating memory, it's probably consuming CPU cycles). However,
   this turned out not to scale very well, especially with memcg using
   rstat for internal operations, which made memcg stat read and flush
   patterns substantially different from those of other controllers.
   JP Kobryn split the rstat tree per controller.

 - cgroup BPF support was hooking into the cgroup init/exit paths
   directly. Convert it to use a notifier chain instead so that other
   usages can be added easily. Two of the patches which implement this
   are mislabeled as belonging to sched_ext instead of cgroup. Sorry.

 - Relatively minor cpuset updates

 - Documentation updates

* tag 'cgroup-for-6.16' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (23 commits)
  sched_ext: Convert cgroup BPF support to use cgroup_lifetime_notifier
  sched_ext: Introduce cgroup_lifetime_notifier
  cgroup: Minor reorganization of cgroup_create()
  cgroup, docs: cpu controller's interaction with various scheduling policies
  cgroup, docs: convert space indentation to tab indentation
  cgroup: avoid per-cpu allocation of size zero rstat cpu locks
  cgroup, docs: be specific about bandwidth control of rt processes
  cgroup: document the rstat per-cpu initialization
  cgroup: helper for checking rstat participation of css
  cgroup: use subsystem-specific rstat locks to avoid contention
  cgroup: use separate rstat trees for each subsystem
  cgroup: compare css to cgroup::self in helper for distingushing css
  cgroup: warn on rstat usage by early init subsystems
  cgroup/cpuset: drop useless cpumask_empty() in compute_effective_exclusive_cpumask()
  cgroup/rstat: Improve cgroup_rstat_push_children() documentation
  cgroup: fix goto ordering in cgroup_init()
  cgroup: fix pointer check in css_rstat_init()
  cgroup/cpuset: Add warnings to catch inconsistency in exclusive CPUs
  cgroup/cpuset: Fix obsolete comment in cpuset_css_offline()
  cgroup/cpuset: Always use cpu_active_mask
  ...
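The notifier conversion shows up directly in the header changes below: include/linux/cgroup.h gains a blocking notifier head plus CGROUP_LIFETIME_ONLINE/OFFLINE events, and include/linux/bpf-cgroup.h drops its dedicated init/exit hooks. As a rough sketch of how an in-kernel user might attach to the new chain (illustrative code, not part of this series; that the chain passes the struct cgroup as the notifier data is an assumption):

/*
 * Hypothetical consumer of cgroup_lifetime_notifier. Assumes the
 * chain is invoked with the struct cgroup * as the notifier data.
 */
#include <linux/cgroup.h>
#include <linux/notifier.h>

static int my_cgroup_lifetime_cb(struct notifier_block *nb,
				 unsigned long action, void *data)
{
	struct cgroup *cgrp = data;	/* assumed payload */

	switch (action) {
	case CGROUP_LIFETIME_ONLINE:
		/* set up per-cgroup state */
		pr_debug("cgroup %llu online\n", cgroup_id(cgrp));
		break;
	case CGROUP_LIFETIME_OFFLINE:
		/* tear down per-cgroup state */
		pr_debug("cgroup %llu offline\n", cgroup_id(cgrp));
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block my_cgroup_nb = {
	.notifier_call = my_cgroup_lifetime_cb,
};

static int __init my_cgroup_user_init(void)
{
	return blocking_notifier_chain_register(&cgroup_lifetime_notifier,
						&my_cgroup_nb);
}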
Diffstat (limited to 'include')
-rw-r--r--	include/linux/bpf-cgroup.h	9
-rw-r--r--	include/linux/cgroup-defs.h	100
-rw-r--r--	include/linux/cgroup.h	24
-rw-r--r--	include/trace/events/cgroup.h	12
4 files changed, 102 insertions, 43 deletions
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 9de7adb68294..60d1511b4f4d 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -114,8 +114,7 @@ struct bpf_prog_list {
u32 flags;
};
-int cgroup_bpf_inherit(struct cgroup *cgrp);
-void cgroup_bpf_offline(struct cgroup *cgrp);
+void __init cgroup_bpf_lifetime_notifier_init(void);
int __cgroup_bpf_run_filter_skb(struct sock *sk,
struct sk_buff *skb,
@@ -431,8 +430,10 @@ const struct bpf_func_proto *
cgroup_current_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog);
#else
-static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
-static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_lifetime_notifier_init(void)
+{
+ return;
+}
static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype,
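
For context, one plausible shape of the replacement hook on the kernel/bpf/cgroup.c side, with the caveat that the names and structure here are assumptions rather than quotes from the series (the old cgroup_bpf_inherit()/cgroup_bpf_offline() entry points become file-local and are dispatched from a single lifetime callback):

/* Sketch only; see kernel/bpf/cgroup.c in the actual tree. */
static int cgroup_bpf_lifetime_notify(struct notifier_block *nb,
				      unsigned long action, void *data)
{
	struct cgroup *cgrp = data;

	switch (action) {
	case CGROUP_LIFETIME_ONLINE:
		cgroup_bpf_inherit(cgrp);	/* now static */
		break;
	case CGROUP_LIFETIME_OFFLINE:
		cgroup_bpf_offline(cgrp);	/* now static */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block cgroup_bpf_lifetime_nb = {
	.notifier_call = cgroup_bpf_lifetime_notify,
};

void __init cgroup_bpf_lifetime_notifier_init(void)
{
	blocking_notifier_chain_register(&cgroup_lifetime_notifier,
					 &cgroup_bpf_lifetime_nb);
}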
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5bc8f55c8cca..e61687d5e496 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -170,6 +170,23 @@ struct cgroup_subsys_state {
struct percpu_ref refcnt;
/*
+ * Depending on the context, this field is initialized
+ * via css_rstat_init() at different places:
+ *
+ * when css is associated with cgroup::self
+ * when css->cgroup is the root cgroup
+ * performed in cgroup_init()
+ * when css->cgroup is not the root cgroup
+ * performed in cgroup_create()
+ * when css is associated with a subsystem
+ * when css->cgroup is the root cgroup
+ * performed in cgroup_init_subsys() in the non-early path
+ * when css->cgroup is not the root cgroup
+ * performed in css_create()
+ */
+ struct css_rstat_cpu __percpu *rstat_cpu;
+
+ /*
* siblings list anchored at the parent's ->children
*
* linkage is protected by cgroup_mutex or RCU
@@ -177,9 +194,6 @@ struct cgroup_subsys_state {
struct list_head sibling;
struct list_head children;
- /* flush target list anchored at cgrp->rstat_css_list */
- struct list_head rstat_css_node;
-
/*
* PI: Subsys-unique ID. 0 is unused and root is always 1. The
* matching css can be looked up using css_from_id().
@@ -219,6 +233,16 @@ struct cgroup_subsys_state {
* Protected by cgroup_mutex.
*/
int nr_descendants;
+
+ /*
+ * A singly-linked list of css structures to be rstat flushed.
+ * This is a scratch field to be used exclusively by
+ * css_rstat_flush().
+ *
+ * Protected by rstat_base_lock when css is cgroup::self.
+ * Protected by css->ss->rstat_ss_lock otherwise.
+ */
+ struct cgroup_subsys_state *rstat_flush_next;
};
/*
@@ -329,10 +353,10 @@ struct cgroup_base_stat {
/*
* rstat - cgroup scalable recursive statistics. Accounting is done
- * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the
+ * per-cpu in css_rstat_cpu which is then lazily propagated up the
* hierarchy on reads.
*
- * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are
+ * When a stat gets updated, the css_rstat_cpu and its ancestors are
* linked into the updated tree. On the following read, propagation only
* considers and consumes the updated tree. This makes reading O(the
* number of descendants which have been active since last read) instead of
@@ -344,10 +368,29 @@ struct cgroup_base_stat {
* frequency decreases the cost of each read.
*
* This struct hosts both the fields which implement the above -
- * updated_children and updated_next - and the fields which track basic
- * resource statistics on top of it - bsync, bstat and last_bstat.
+ * updated_children and updated_next.
*/
-struct cgroup_rstat_cpu {
+struct css_rstat_cpu {
+ /*
+ * Child cgroups with stat updates on this cpu since the last read
+ * are linked on the parent's ->updated_children through
+ * ->updated_next. updated_children is terminated by its container css.
+ *
+ * In addition to being more compact, singly-linked list pointing to
+ * the css makes it unnecessary for each per-cpu struct to point back
+ * to the associated css.
+ *
+ * Protected by per-cpu css->ss->rstat_ss_cpu_lock.
+ */
+ struct cgroup_subsys_state *updated_children;
+ struct cgroup_subsys_state *updated_next; /* NULL if not on the list */
+};
+
+/*
+ * This struct hosts the fields which track basic resource statistics on
+ * top of it - bsync, bstat and last_bstat.
+ */
+struct cgroup_rstat_base_cpu {
/*
* ->bsync protects ->bstat. These are the only fields which get
* updated in the hot path.
@@ -374,20 +417,6 @@ struct cgroup_rstat_cpu {
* deltas to propagate to the per-cpu subtree_bstat.
*/
struct cgroup_base_stat last_subtree_bstat;
-
- /*
- * Child cgroups with stat updates on this cpu since the last read
- * are linked on the parent's ->updated_children through
- * ->updated_next.
- *
- * In addition to being more compact, singly-linked list pointing
- * to the cgroup makes it unnecessary for each per-cpu struct to
- * point back to the associated cgroup.
- *
- * Protected by per-cpu cgroup_rstat_cpu_lock.
- */
- struct cgroup *updated_children; /* terminated by self cgroup */
- struct cgroup *updated_next; /* NULL iff not on the list */
};
struct cgroup_freezer_state {
@@ -516,23 +545,23 @@ struct cgroup {
struct cgroup *dom_cgrp;
struct cgroup *old_dom_cgrp; /* used while enabling threaded */
- /* per-cpu recursive resource statistics */
- struct cgroup_rstat_cpu __percpu *rstat_cpu;
- struct list_head rstat_css_list;
-
/*
- * Add padding to separate the read mostly rstat_cpu and
- * rstat_css_list into a different cacheline from the following
- * rstat_flush_next and *bstat fields which can have frequent updates.
+ * Depending on the context, this field is initialized via
+ * css_rstat_init() at different places:
+ *
+ * when cgroup is the root cgroup
+ * performed in cgroup_setup_root()
+ * otherwise
+ * performed in cgroup_create()
*/
- CACHELINE_PADDING(_pad_);
+ struct cgroup_rstat_base_cpu __percpu *rstat_base_cpu;
/*
- * A singly-linked list of cgroup structures to be rstat flushed.
- * This is a scratch field to be used exclusively by
- * cgroup_rstat_flush_locked() and protected by cgroup_rstat_lock.
+ * Add padding to keep the read mostly rstat per-cpu pointer on a
+ * different cacheline than the following *bstat fields which can have
+ * frequent updates.
*/
- struct cgroup *rstat_flush_next;
+ CACHELINE_PADDING(_pad_);
/* cgroup basic resource statistics */
struct cgroup_base_stat last_bstat;
@@ -790,6 +819,9 @@ struct cgroup_subsys {
* specifies the mask of subsystems that this one depends on.
*/
unsigned int depends_on;
+
+ spinlock_t rstat_ss_lock;
+ raw_spinlock_t __percpu *rstat_ss_cpu_lock;
};
extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
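
Taken together, the cgroup-defs.h changes move the per-cpu updated tree from struct cgroup onto each css, with the locks living in the owning cgroup_subsys. A simplified sketch of the linking that the updated_children/updated_next comments describe (the real css_rstat_updated() in kernel/cgroup/rstat.c additionally handles the cgroup::self case via the base-stat locks; that is omitted here):

/* Sketch of linking a css into this CPU's updated tree. */
static void css_rstat_updated_sketch(struct cgroup_subsys_state *css, int cpu)
{
	raw_spinlock_t *cpu_lock = per_cpu_ptr(css->ss->rstat_ss_cpu_lock, cpu);
	unsigned long flags;

	raw_spin_lock_irqsave(cpu_lock, flags);

	/* Walk towards the root, linking each css not yet on a list. */
	for (; css; css = css->parent) {
		struct css_rstat_cpu *rstatc = per_cpu_ptr(css->rstat_cpu, cpu);
		struct css_rstat_cpu *prstatc;

		/* already linked on its parent's updated_children? */
		if (rstatc->updated_next)
			break;

		/* the root has no parent; mark it busy by self-termination */
		if (!css->parent) {
			rstatc->updated_next = css;
			break;
		}

		prstatc = per_cpu_ptr(css->parent->rstat_cpu, cpu);
		rstatc->updated_next = prstatc->updated_children;
		prstatc->updated_children = css;
	}

	raw_spin_unlock_irqrestore(cpu_lock, flags);
}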
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 166d6de50dbf..b18fb5fcb38e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -19,6 +19,7 @@
#include <linux/kernfs.h>
#include <linux/jump_label.h>
#include <linux/types.h>
+#include <linux/notifier.h>
#include <linux/ns_common.h>
#include <linux/nsproxy.h>
#include <linux/user_namespace.h>
@@ -40,7 +41,7 @@ struct kernel_clone_args;
#ifdef CONFIG_CGROUPS
-enum {
+enum css_task_iter_flags {
CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */
CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */
CSS_TASK_ITER_SKIPPED = (1U << 16), /* internal flags */
@@ -66,10 +67,16 @@ struct css_task_iter {
struct list_head iters_node; /* css_set->task_iters */
};
+enum cgroup_lifetime_events {
+ CGROUP_LIFETIME_ONLINE,
+ CGROUP_LIFETIME_OFFLINE,
+};
+
extern struct file_system_type cgroup_fs_type;
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
extern spinlock_t css_set_lock;
+extern struct blocking_notifier_head cgroup_lifetime_notifier;
#define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys;
#include <linux/cgroup_subsys.h>
@@ -347,6 +354,17 @@ static inline bool css_is_dying(struct cgroup_subsys_state *css)
return css->flags & CSS_DYING;
}
+static inline bool css_is_self(struct cgroup_subsys_state *css)
+{
+ if (css == &css->cgroup->self) {
+ /* cgroup::self should not have subsystem association */
+ WARN_ON(css->ss != NULL);
+ return true;
+ }
+
+ return false;
+}
+
static inline void cgroup_get(struct cgroup *cgrp)
{
css_get(&cgrp->self);
@@ -688,8 +706,8 @@ static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
/*
* cgroup scalable recursive statistics.
*/
-void cgroup_rstat_updated(struct cgroup *cgrp, int cpu);
-void cgroup_rstat_flush(struct cgroup *cgrp);
+void css_rstat_updated(struct cgroup_subsys_state *css, int cpu);
+void css_rstat_flush(struct cgroup_subsys_state *css);
/*
* Basic resource stats.
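
A hypothetical controller would use the renamed per-css entry points roughly as follows; my_css_stat, pcpu_value and aggregated are illustrative names, not anything introduced by this merge:

/* Sketch of the update/read pairing around the renamed API. */
struct my_css_stat {
	u64 __percpu *pcpu_value;	/* per-cpu deltas */
	u64 aggregated;			/* folded in by a flush callback */
};

static void my_stat_add(struct cgroup_subsys_state *css,
			struct my_css_stat *stat, u64 delta)
{
	this_cpu_add(*stat->pcpu_value, delta);
	/* caller assumed to have preemption disabled */
	css_rstat_updated(css, smp_processor_id());
}

static u64 my_stat_read(struct cgroup_subsys_state *css,
			struct my_css_stat *stat)
{
	css_rstat_flush(css);	/* consume the updated tree */
	return stat->aggregated;
}

css_is_self(), added above, is what shared rstat code can use to decide between the base-stat locks and the per-subsystem rstat_ss_lock/rstat_ss_cpu_lock pair declared in cgroup-defs.h.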
diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h
index af2755bda6eb..7d332387be6c 100644
--- a/include/trace/events/cgroup.h
+++ b/include/trace/events/cgroup.h
@@ -231,7 +231,11 @@ DECLARE_EVENT_CLASS(cgroup_rstat,
__entry->cpu, __entry->contended)
);
-/* Related to global: cgroup_rstat_lock */
+/*
+ * Related to locks:
+ * global rstat_base_lock for base stats
+ * cgroup_subsys::rstat_ss_lock for subsystem stats
+ */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_lock_contended,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),
@@ -253,7 +257,11 @@ DEFINE_EVENT(cgroup_rstat, cgroup_rstat_unlock,
TP_ARGS(cgrp, cpu, contended)
);
-/* Related to per CPU: cgroup_rstat_cpu_lock */
+/*
+ * Related to per CPU locks:
+ * global rstat_base_cpu_lock for base stats
+ * cgroup_subsys::rstat_ss_cpu_lock for subsystem stats
+ */
DEFINE_EVENT(cgroup_rstat, cgroup_rstat_cpu_lock_contended,
TP_PROTO(struct cgroup *cgrp, int cpu, bool contended),