aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup/rstat.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup/rstat.c')
-rw-r--r--kernel/cgroup/rstat.c161
1 files changed, 123 insertions, 38 deletions
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c
index 1486768f2318..793ecff29038 100644
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -3,6 +3,10 @@
#include <linux/sched/cputime.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+
static DEFINE_SPINLOCK(cgroup_rstat_lock);
static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock);
@@ -35,7 +39,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
- if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
+ if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
@@ -88,6 +92,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
+ struct cgroup *parent;
if (pos == root)
return NULL;
@@ -96,10 +101,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick whatever unvisited node as the starting point.
*/
- if (!pos)
+ if (!pos) {
pos = root;
- else
+ /* return NULL if this subtree is not on-list */
+ if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
+ return NULL;
+ } else {
pos = cgroup_parent(pos);
+ }
/* walk down to the first leaf */
while (true) {
@@ -115,35 +124,52 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
- if (rstatc->updated_next) {
- struct cgroup *parent = cgroup_parent(pos);
-
- if (parent) {
- struct cgroup_rstat_cpu *prstatc;
- struct cgroup **nextp;
-
- prstatc = cgroup_rstat_cpu(parent, cpu);
- nextp = &prstatc->updated_children;
- while (true) {
- struct cgroup_rstat_cpu *nrstatc;
-
- nrstatc = cgroup_rstat_cpu(*nextp, cpu);
- if (*nextp == pos)
- break;
- WARN_ON_ONCE(*nextp == parent);
- nextp = &nrstatc->updated_next;
- }
- *nextp = rstatc->updated_next;
- }
+ parent = cgroup_parent(pos);
+ if (parent) {
+ struct cgroup_rstat_cpu *prstatc;
+ struct cgroup **nextp;
+
+ prstatc = cgroup_rstat_cpu(parent, cpu);
+ nextp = &prstatc->updated_children;
+ while (*nextp != pos) {
+ struct cgroup_rstat_cpu *nrstatc;
- rstatc->updated_next = NULL;
- return pos;
+ nrstatc = cgroup_rstat_cpu(*nextp, cpu);
+ WARN_ON_ONCE(*nextp == parent);
+ nextp = &nrstatc->updated_next;
+ }
+ *nextp = rstatc->updated_next;
}
- /* only happens for @root */
- return NULL;
+ rstatc->updated_next = NULL;
+ return pos;
}
+/*
+ * A hook for bpf stat collectors to attach to and flush their stats.
+ * Together with providing bpf kfuncs for cgroup_rstat_updated() and
+ * cgroup_rstat_flush(), this enables a complete workflow where bpf progs that
+ * collect cgroup stats can integrate with rstat for efficient flushing.
+ *
+ * A static noinline declaration here could cause the compiler to optimize away
+ * the function. A global noinline declaration will keep the definition, but may
+ * optimize away the callsite. Therefore, __weak is needed to ensure that the
+ * call is still emitted, by telling the compiler that we don't know what the
+ * function might eventually be.
+ *
+ * __diag_* below are needed to dismiss the missing prototype warning.
+ */
+__diag_push();
+__diag_ignore_all("-Wmissing-prototypes",
+ "kfuncs which will be used in BPF programs");
+
+__weak noinline void bpf_rstat_flush(struct cgroup *cgrp,
+ struct cgroup *parent, int cpu)
+{
+}
+
+__diag_pop();
+
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
@@ -156,12 +182,22 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock,
cpu);
struct cgroup *pos = NULL;
+ unsigned long flags;
- raw_spin_lock(cpu_lock);
+ /*
+ * The _irqsave() is needed because cgroup_rstat_lock is
+ * spinlock_t which is a sleeping lock on PREEMPT_RT. Acquiring
+ * this lock with the _irq() suffix only disables interrupts on
+ * a non-PREEMPT_RT kernel. The raw_spinlock_t below disables
+ * interrupts on both configurations. The _irqsave() ensures
+ * that interrupts are always disabled and later restored.
+ */
+ raw_spin_lock_irqsave(cpu_lock, flags);
while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) {
struct cgroup_subsys_state *css;
cgroup_base_stat_flush(pos, cpu);
+ bpf_rstat_flush(pos, cgroup_parent(pos), cpu);
rcu_read_lock();
list_for_each_entry_rcu(css, &pos->rstat_css_list,
@@ -169,7 +205,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
css->ss->css_rstat_flush(css, cpu);
rcu_read_unlock();
}
- raw_spin_unlock(cpu_lock);
+ raw_spin_unlock_irqrestore(cpu_lock, flags);
/* if @may_sleep, play nice and yield if necessary */
if (may_sleep && (need_resched() ||
@@ -304,6 +340,9 @@ static void cgroup_base_stat_add(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime += src_bstat->cputime.utime;
dst_bstat->cputime.stime += src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum += src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
@@ -312,13 +351,16 @@ static void cgroup_base_stat_sub(struct cgroup_base_stat *dst_bstat,
dst_bstat->cputime.utime -= src_bstat->cputime.utime;
dst_bstat->cputime.stime -= src_bstat->cputime.stime;
dst_bstat->cputime.sum_exec_runtime -= src_bstat->cputime.sum_exec_runtime;
+#ifdef CONFIG_SCHED_CORE
+ dst_bstat->forceidle_sum -= src_bstat->forceidle_sum;
+#endif
}
static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
{
struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu);
struct cgroup *parent = cgroup_parent(cgrp);
- struct cgroup_base_stat cur, delta;
+ struct cgroup_base_stat delta;
unsigned seq;
/* Root-level stats are sourced from system-wide CPU stats */
@@ -328,11 +370,10 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu)
/* fetch the current per-cpu values */
do {
seq = __u64_stats_fetch_begin(&rstatc->bsync);
- cur.cputime = rstatc->bstat.cputime;
+ delta = rstatc->bstat;
} while (__u64_stats_fetch_retry(&rstatc->bsync, seq));
/* propagate percpu delta to global */
- delta = cur;
cgroup_base_stat_sub(&delta, &rstatc->last_bstat);
cgroup_base_stat_add(&cgrp->bstat, &delta);
cgroup_base_stat_add(&rstatc->last_bstat, &delta);
@@ -393,6 +434,11 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
case CPUTIME_SOFTIRQ:
rstatc->bstat.cputime.stime += delta_exec;
break;
+#ifdef CONFIG_SCHED_CORE
+ case CPUTIME_FORCEIDLE:
+ rstatc->bstat.forceidle_sum += delta_exec;
+ break;
+#endif
default:
break;
}
@@ -406,8 +452,9 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp,
* with how it is done by __cgroup_account_cputime_field for each bit of
* cpu time attributed to a cgroup.
*/
-static void root_cgroup_cputime(struct task_cputime *cputime)
+static void root_cgroup_cputime(struct cgroup_base_stat *bstat)
{
+ struct task_cputime *cputime = &bstat->cputime;
int i;
cputime->stime = 0;
@@ -433,6 +480,10 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
cputime->sum_exec_runtime += user;
cputime->sum_exec_runtime += sys;
cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
+
+#ifdef CONFIG_SCHED_CORE
+ bstat->forceidle_sum += cpustat[CPUTIME_FORCEIDLE];
+#endif
}
}
@@ -440,27 +491,61 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
u64 usage, utime, stime;
- struct task_cputime cputime;
+ struct cgroup_base_stat bstat;
+#ifdef CONFIG_SCHED_CORE
+ u64 forceidle_time;
+#endif
if (cgroup_parent(cgrp)) {
cgroup_rstat_flush_hold(cgrp);
usage = cgrp->bstat.cputime.sum_exec_runtime;
cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime,
&utime, &stime);
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = cgrp->bstat.forceidle_sum;
+#endif
cgroup_rstat_flush_release();
} else {
- root_cgroup_cputime(&cputime);
- usage = cputime.sum_exec_runtime;
- utime = cputime.utime;
- stime = cputime.stime;
+ root_cgroup_cputime(&bstat);
+ usage = bstat.cputime.sum_exec_runtime;
+ utime = bstat.cputime.utime;
+ stime = bstat.cputime.stime;
+#ifdef CONFIG_SCHED_CORE
+ forceidle_time = bstat.forceidle_sum;
+#endif
}
do_div(usage, NSEC_PER_USEC);
do_div(utime, NSEC_PER_USEC);
do_div(stime, NSEC_PER_USEC);
+#ifdef CONFIG_SCHED_CORE
+ do_div(forceidle_time, NSEC_PER_USEC);
+#endif
seq_printf(seq, "usage_usec %llu\n"
"user_usec %llu\n"
"system_usec %llu\n",
usage, utime, stime);
+
+#ifdef CONFIG_SCHED_CORE
+ seq_printf(seq, "core_sched.force_idle_usec %llu\n", forceidle_time);
+#endif
+}
+
+/* Add bpf kfuncs for cgroup_rstat_updated() and cgroup_rstat_flush() */
+BTF_SET8_START(bpf_rstat_kfunc_ids)
+BTF_ID_FLAGS(func, cgroup_rstat_updated)
+BTF_ID_FLAGS(func, cgroup_rstat_flush, KF_SLEEPABLE)
+BTF_SET8_END(bpf_rstat_kfunc_ids)
+
+static const struct btf_kfunc_id_set bpf_rstat_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_rstat_kfunc_ids,
+};
+
+static int __init bpf_rstat_kfunc_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING,
+ &bpf_rstat_kfunc_set);
}
+late_initcall(bpf_rstat_kfunc_init);