From 5faaf05f2976fd9ec0ecd582bcfb3a41cde4c65e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:04 -0700 Subject: cgroup: Explicitly remove core interface files The "cgroup." core interface files bypass the usual interface removal path and get removed recursively along with the cgroup itself. While this works now, the subtle discrepancy gets in the way of implementing common mechanisms. This patch updates cgroup core interface file handling so that it's consistent with controller interface files. When added, the css is marked CSS_VISIBLE and they're explicitly removed before the cgroup is destroyed. This doesn't cause user-visible behavior changes. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a662bfcbea0e..25309c868728 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1573,8 +1573,17 @@ static void css_clear_dir(struct cgroup_subsys_state *css) css->flags &= ~CSS_VISIBLE; - list_for_each_entry(cfts, &css->ss->cfts, node) + if (!css->ss) { + if (cgroup_on_dfl(cgrp)) + cfts = cgroup_base_files; + else + cfts = cgroup1_base_files; + cgroup_addrm_files(css, cgrp, cfts, false); + } else { + list_for_each_entry(cfts, &css->ss->cfts, node) + cgroup_addrm_files(css, cgrp, cfts, false); + } } /** @@ -1598,14 +1607,16 @@ static int css_populate_dir(struct cgroup_subsys_state *css) else cfts = cgroup1_base_files; - return cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); - } - - list_for_each_entry(cfts, &css->ss->cfts, node) { - ret = cgroup_addrm_files(css, cgrp, cfts, true); - if (ret < 0) { - failed_cfts = cfts; - goto err; + ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true); + if (ret < 0) + return ret; + } else { + list_for_each_entry(cfts, &css->ss->cfts, node) { + ret = cgroup_addrm_files(css, cgrp, cfts, true); + if (ret < 0) { + failed_cfts = cfts; + goto err; + } } } @@ -5090,10 +5101,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) for_each_css(css, ssid, cgrp) kill_css(css); - /* - * Remove @cgrp directory along with the base files. @cgrp has an - * extra ref on its kn. - */ + /* clear and remove @cgrp dir, @cgrp has an extra ref on its kn */ + css_clear_dir(&cgrp->self); kernfs_remove(cgrp->kn); if (parent && cgroup_is_threaded(cgrp)) -- cgit v1.2.3-59-g8ed1b From b12e35832805bf120819b83d8fdb6e4d227a5ddb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:04 -0700 Subject: cgroup: Limit event generation frequency ".events" files generate file modified event to notify userland of possible new events. Some of the events can be quite bursty (e.g. memory high event) and generating notification each time is costly and pointless. This patch implements a event rate limit mechanism. If a new notification is requested before 10ms has passed since the previous notification, the new notification is delayed till then. As this only delays from the second notification on in a given close cluster of notifications, userland reactions to notifications shouldn't be delayed at all in most cases while avoiding notification storms. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 ++ kernel/cgroup/cgroup.c | 25 +++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index dc5b70449dc6..133531fcfb33 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -105,6 +105,8 @@ enum { struct cgroup_file { /* do not access any fields from outside cgroup core */ struct kernfs_node *kn; + unsigned long notified_at; + struct timer_list notify_timer; }; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 25309c868728..fdb7a582f8fc 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -61,6 +61,8 @@ #define CGROUP_FILE_NAME_MAX (MAX_CGROUP_TYPE_NAMELEN + \ MAX_CFTYPE_NAME + 2) +/* let's not notify more than 100 times per second */ +#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100) /* * cgroup_mutex is the master lock. Any modification to cgroup or its @@ -1554,6 +1556,8 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = NULL; spin_unlock_irq(&cgroup_file_kn_lock); + + del_timer_sync(&cfile->notify_timer); } kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); @@ -3532,6 +3536,12 @@ static int cgroup_kn_set_ugid(struct kernfs_node *kn) return kernfs_setattr(kn, &iattr); } +static void cgroup_file_notify_timer(struct timer_list *timer) +{ + cgroup_file_notify(container_of(timer, struct cgroup_file, + notify_timer)); +} + static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, struct cftype *cft) { @@ -3558,6 +3568,8 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; + timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; spin_unlock_irq(&cgroup_file_kn_lock); @@ -3807,8 +3819,17 @@ void cgroup_file_notify(struct cgroup_file *cfile) unsigned long flags; spin_lock_irqsave(&cgroup_file_kn_lock, flags); - if (cfile->kn) - kernfs_notify(cfile->kn); + if (cfile->kn) { + unsigned long last = cfile->notified_at; + unsigned long next = last + CGROUP_FILE_NOTIFY_MIN_INTV; + + if (time_in_range(jiffies, last, next)) { + timer_reduce(&cfile->notify_timer, next); + } else { + kernfs_notify(cfile->kn); + cfile->notified_at = jiffies; + } + } spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); } -- cgit v1.2.3-59-g8ed1b From a5c2b93f79ef7d746a3cd2c1bd66833286f9be70 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:04 -0700 Subject: cgroup: Rename kernel/cgroup/stat.c to kernel/cgroup/rstat.c stat is too generic a name and ends up causing subtle confusions. It'll be made generic so that controllers can plug into it, which will make the problem worse. Let's rename it to something more specific - cgroup_rstat for cgroup recursive stat. First, rename kernel/cgroup/stat.c to kernel/cgroup/rstat.c. No content changes. Signed-off-by: Tejun Heo --- kernel/cgroup/Makefile | 2 +- kernel/cgroup/rstat.c | 338 +++++++++++++++++++++++++++++++++++++++++++++++++ kernel/cgroup/stat.c | 338 ------------------------------------------------- 3 files changed, 339 insertions(+), 339 deletions(-) create mode 100644 kernel/cgroup/rstat.c delete mode 100644 kernel/cgroup/stat.c (limited to 'kernel') diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index 2be89a003185..bfcdae896122 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o stat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o obj-$(CONFIG_CGROUP_FREEZER) += freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c new file mode 100644 index 000000000000..1e111dd455c4 --- /dev/null +++ b/kernel/cgroup/rstat.c @@ -0,0 +1,338 @@ +#include "cgroup-internal.h" + +#include + +static DEFINE_MUTEX(cgroup_stat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); + +static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +{ + return per_cpu_ptr(cgrp->cpu_stat, cpu); +} + +/** + * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * @cgrp: target cgroup + * @cpu: cpu on which cpu_stat was updated + * + * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching + * cpu_stat->updated_children list. See the comment on top of + * cgroup_cpu_stat definition for details. + */ +static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) +{ + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *parent; + unsigned long flags; + + /* + * Speculative already-on-list test. This may race leading to + * temporary inaccuracies, which is fine. + * + * Because @parent's updated_children is terminated with @parent + * instead of NULL, we can tell whether @cgrp is on the list by + * testing the next pointer for NULL. + */ + if (cgroup_cpu_stat(cgrp, cpu)->updated_next) + return; + + raw_spin_lock_irqsave(cpu_lock, flags); + + /* put @cgrp and all ancestors on the corresponding updated lists */ + for (parent = cgroup_parent(cgrp); parent; + cgrp = parent, parent = cgroup_parent(cgrp)) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + + /* + * Both additions and removals are bottom-up. If a cgroup + * is already in the tree, all ancestors are. + */ + if (cstat->updated_next) + break; + + cstat->updated_next = pcstat->updated_children; + pcstat->updated_children = cgrp; + } + + raw_spin_unlock_irqrestore(cpu_lock, flags); +} + +/** + * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree + * @pos: current position + * @root: root of the tree to traversal + * @cpu: target cpu + * + * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts + * the traversal and %NULL return indicates the end. During traversal, + * each returned cgroup is unlinked from the tree. Must be called with the + * matching cgroup_cpu_stat_lock held. + * + * The only ordering guarantee is that, for a parent and a child pair + * covered by a given traversal, if a child is visited, its parent is + * guaranteed to be visited afterwards. + */ +static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) +{ + struct cgroup_cpu_stat *cstat; + struct cgroup *parent; + + if (pos == root) + return NULL; + + /* + * We're gonna walk down to the first leaf and visit/remove it. We + * can pick whatever unvisited node as the starting point. + */ + if (!pos) + pos = root; + else + pos = cgroup_parent(pos); + + /* walk down to the first leaf */ + while (true) { + cstat = cgroup_cpu_stat(pos, cpu); + if (cstat->updated_children == pos) + break; + pos = cstat->updated_children; + } + + /* + * Unlink @pos from the tree. As the updated_children list is + * singly linked, we have to walk it to find the removal point. + * However, due to the way we traverse, @pos will be the first + * child in most cases. The only exception is @root. + */ + parent = cgroup_parent(pos); + if (parent && cstat->updated_next) { + struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + struct cgroup_cpu_stat *ncstat; + struct cgroup **nextp; + + nextp = &pcstat->updated_children; + while (true) { + ncstat = cgroup_cpu_stat(*nextp, cpu); + if (*nextp == pos) + break; + + WARN_ON_ONCE(*nextp == parent); + nextp = &ncstat->updated_next; + } + + *nextp = cstat->updated_next; + cstat->updated_next = NULL; + } + + return pos; +} + +static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, + struct cgroup_stat *src_stat) +{ + dst_stat->cputime.utime += src_stat->cputime.utime; + dst_stat->cputime.stime += src_stat->cputime.stime; + dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; +} + +static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) +{ + struct cgroup *parent = cgroup_parent(cgrp); + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct task_cputime *last_cputime = &cstat->last_cputime; + struct task_cputime cputime; + struct cgroup_stat delta; + unsigned seq; + + lockdep_assert_held(&cgroup_stat_mutex); + + /* fetch the current per-cpu values */ + do { + seq = __u64_stats_fetch_begin(&cstat->sync); + cputime = cstat->cputime; + } while (__u64_stats_fetch_retry(&cstat->sync, seq)); + + /* accumulate the deltas to propgate */ + delta.cputime.utime = cputime.utime - last_cputime->utime; + delta.cputime.stime = cputime.stime - last_cputime->stime; + delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - + last_cputime->sum_exec_runtime; + *last_cputime = cputime; + + /* transfer the pending stat into delta */ + cgroup_stat_accumulate(&delta, &cgrp->pending_stat); + memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); + + /* propagate delta into the global stat and the parent's pending */ + cgroup_stat_accumulate(&cgrp->stat, &delta); + if (parent) + cgroup_stat_accumulate(&parent->pending_stat, &delta); +} + +/* see cgroup_stat_flush() */ +static void cgroup_stat_flush_locked(struct cgroup *cgrp) +{ + int cpu; + + lockdep_assert_held(&cgroup_stat_mutex); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + struct cgroup *pos = NULL; + + raw_spin_lock_irq(cpu_lock); + while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) + cgroup_cpu_stat_flush_one(pos, cpu); + raw_spin_unlock_irq(cpu_lock); + } +} + +/** + * cgroup_stat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + */ +void cgroup_stat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_stat_mutex); + cgroup_stat_flush_locked(cgrp); + mutex_unlock(&cgroup_stat_mutex); +} + +static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +{ + struct cgroup_cpu_stat *cstat; + + cstat = get_cpu_ptr(cgrp->cpu_stat); + u64_stats_update_begin(&cstat->sync); + return cstat; +} + +static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, + struct cgroup_cpu_stat *cstat) +{ + u64_stats_update_end(&cstat->sync); + cgroup_cpu_stat_updated(cgrp, smp_processor_id()); + put_cpu_ptr(cstat); +} + +void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + cstat->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void __cgroup_account_cputime_field(struct cgroup *cgrp, + enum cpu_usage_stat index, u64 delta_exec) +{ + struct cgroup_cpu_stat *cstat; + + cstat = cgroup_cpu_stat_account_begin(cgrp); + + switch (index) { + case CPUTIME_USER: + case CPUTIME_NICE: + cstat->cputime.utime += delta_exec; + break; + case CPUTIME_SYSTEM: + case CPUTIME_IRQ: + case CPUTIME_SOFTIRQ: + cstat->cputime.stime += delta_exec; + break; + default: + break; + } + + cgroup_cpu_stat_account_end(cgrp, cstat); +} + +void cgroup_stat_show_cputime(struct seq_file *seq) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + u64 usage, utime, stime; + + if (!cgroup_parent(cgrp)) + return; + + mutex_lock(&cgroup_stat_mutex); + + cgroup_stat_flush_locked(cgrp); + + usage = cgrp->stat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, + &utime, &stime); + + mutex_unlock(&cgroup_stat_mutex); + + do_div(usage, NSEC_PER_USEC); + do_div(utime, NSEC_PER_USEC); + do_div(stime, NSEC_PER_USEC); + + seq_printf(seq, "usage_usec %llu\n" + "user_usec %llu\n" + "system_usec %llu\n", + usage, utime, stime); +} + +int cgroup_stat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has cpu_stat preallocated */ + if (!cgrp->cpu_stat) { + cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); + if (!cgrp->cpu_stat) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + cstat->updated_children = cgrp; + u64_stats_init(&cstat->sync); + } + + prev_cputime_init(&cgrp->stat.prev_cputime); + + return 0; +} + +void cgroup_stat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_stat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + + if (WARN_ON_ONCE(cstat->updated_children != cgrp) || + WARN_ON_ONCE(cstat->updated_next)) + return; + } + + free_percpu(cgrp->cpu_stat); + cgrp->cpu_stat = NULL; +} + +void __init cgroup_stat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + + BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); +} diff --git a/kernel/cgroup/stat.c b/kernel/cgroup/stat.c deleted file mode 100644 index 1e111dd455c4..000000000000 --- a/kernel/cgroup/stat.c +++ /dev/null @@ -1,338 +0,0 @@ -#include "cgroup-internal.h" - -#include - -static DEFINE_MUTEX(cgroup_stat_mutex); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); - -static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) -{ - return per_cpu_ptr(cgrp->cpu_stat, cpu); -} - -/** - * cgroup_cpu_stat_updated - keep track of updated cpu_stat - * @cgrp: target cgroup - * @cpu: cpu on which cpu_stat was updated - * - * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching - * cpu_stat->updated_children list. See the comment on top of - * cgroup_cpu_stat definition for details. - */ -static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) -{ - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); - struct cgroup *parent; - unsigned long flags; - - /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. - * - * Because @parent's updated_children is terminated with @parent - * instead of NULL, we can tell whether @cgrp is on the list by - * testing the next pointer for NULL. - */ - if (cgroup_cpu_stat(cgrp, cpu)->updated_next) - return; - - raw_spin_lock_irqsave(cpu_lock, flags); - - /* put @cgrp and all ancestors on the corresponding updated lists */ - for (parent = cgroup_parent(cgrp); parent; - cgrp = parent, parent = cgroup_parent(cgrp)) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - - /* - * Both additions and removals are bottom-up. If a cgroup - * is already in the tree, all ancestors are. - */ - if (cstat->updated_next) - break; - - cstat->updated_next = pcstat->updated_children; - pcstat->updated_children = cgrp; - } - - raw_spin_unlock_irqrestore(cpu_lock, flags); -} - -/** - * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree - * @pos: current position - * @root: root of the tree to traversal - * @cpu: target cpu - * - * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts - * the traversal and %NULL return indicates the end. During traversal, - * each returned cgroup is unlinked from the tree. Must be called with the - * matching cgroup_cpu_stat_lock held. - * - * The only ordering guarantee is that, for a parent and a child pair - * covered by a given traversal, if a child is visited, its parent is - * guaranteed to be visited afterwards. - */ -static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, - struct cgroup *root, int cpu) -{ - struct cgroup_cpu_stat *cstat; - struct cgroup *parent; - - if (pos == root) - return NULL; - - /* - * We're gonna walk down to the first leaf and visit/remove it. We - * can pick whatever unvisited node as the starting point. - */ - if (!pos) - pos = root; - else - pos = cgroup_parent(pos); - - /* walk down to the first leaf */ - while (true) { - cstat = cgroup_cpu_stat(pos, cpu); - if (cstat->updated_children == pos) - break; - pos = cstat->updated_children; - } - - /* - * Unlink @pos from the tree. As the updated_children list is - * singly linked, we have to walk it to find the removal point. - * However, due to the way we traverse, @pos will be the first - * child in most cases. The only exception is @root. - */ - parent = cgroup_parent(pos); - if (parent && cstat->updated_next) { - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - struct cgroup_cpu_stat *ncstat; - struct cgroup **nextp; - - nextp = &pcstat->updated_children; - while (true) { - ncstat = cgroup_cpu_stat(*nextp, cpu); - if (*nextp == pos) - break; - - WARN_ON_ONCE(*nextp == parent); - nextp = &ncstat->updated_next; - } - - *nextp = cstat->updated_next; - cstat->updated_next = NULL; - } - - return pos; -} - -static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, - struct cgroup_stat *src_stat) -{ - dst_stat->cputime.utime += src_stat->cputime.utime; - dst_stat->cputime.stime += src_stat->cputime.stime; - dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; -} - -static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) -{ - struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct task_cputime *last_cputime = &cstat->last_cputime; - struct task_cputime cputime; - struct cgroup_stat delta; - unsigned seq; - - lockdep_assert_held(&cgroup_stat_mutex); - - /* fetch the current per-cpu values */ - do { - seq = __u64_stats_fetch_begin(&cstat->sync); - cputime = cstat->cputime; - } while (__u64_stats_fetch_retry(&cstat->sync, seq)); - - /* accumulate the deltas to propgate */ - delta.cputime.utime = cputime.utime - last_cputime->utime; - delta.cputime.stime = cputime.stime - last_cputime->stime; - delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - - last_cputime->sum_exec_runtime; - *last_cputime = cputime; - - /* transfer the pending stat into delta */ - cgroup_stat_accumulate(&delta, &cgrp->pending_stat); - memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); - - /* propagate delta into the global stat and the parent's pending */ - cgroup_stat_accumulate(&cgrp->stat, &delta); - if (parent) - cgroup_stat_accumulate(&parent->pending_stat, &delta); -} - -/* see cgroup_stat_flush() */ -static void cgroup_stat_flush_locked(struct cgroup *cgrp) -{ - int cpu; - - lockdep_assert_held(&cgroup_stat_mutex); - - for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); - struct cgroup *pos = NULL; - - raw_spin_lock_irq(cpu_lock); - while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) - cgroup_cpu_stat_flush_one(pos, cpu); - raw_spin_unlock_irq(cpu_lock); - } -} - -/** - * cgroup_stat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup - * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. - * - * This also gets all cgroups in the subtree including @cgrp off the - * ->updated_children lists. - */ -void cgroup_stat_flush(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_stat_mutex); - cgroup_stat_flush_locked(cgrp); - mutex_unlock(&cgroup_stat_mutex); -} - -static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) -{ - struct cgroup_cpu_stat *cstat; - - cstat = get_cpu_ptr(cgrp->cpu_stat); - u64_stats_update_begin(&cstat->sync); - return cstat; -} - -static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, - struct cgroup_cpu_stat *cstat) -{ - u64_stats_update_end(&cstat->sync); - cgroup_cpu_stat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(cstat); -} - -void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) -{ - struct cgroup_cpu_stat *cstat; - - cstat = cgroup_cpu_stat_account_begin(cgrp); - cstat->cputime.sum_exec_runtime += delta_exec; - cgroup_cpu_stat_account_end(cgrp, cstat); -} - -void __cgroup_account_cputime_field(struct cgroup *cgrp, - enum cpu_usage_stat index, u64 delta_exec) -{ - struct cgroup_cpu_stat *cstat; - - cstat = cgroup_cpu_stat_account_begin(cgrp); - - switch (index) { - case CPUTIME_USER: - case CPUTIME_NICE: - cstat->cputime.utime += delta_exec; - break; - case CPUTIME_SYSTEM: - case CPUTIME_IRQ: - case CPUTIME_SOFTIRQ: - cstat->cputime.stime += delta_exec; - break; - default: - break; - } - - cgroup_cpu_stat_account_end(cgrp, cstat); -} - -void cgroup_stat_show_cputime(struct seq_file *seq) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - u64 usage, utime, stime; - - if (!cgroup_parent(cgrp)) - return; - - mutex_lock(&cgroup_stat_mutex); - - cgroup_stat_flush_locked(cgrp); - - usage = cgrp->stat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, - &utime, &stime); - - mutex_unlock(&cgroup_stat_mutex); - - do_div(usage, NSEC_PER_USEC); - do_div(utime, NSEC_PER_USEC); - do_div(stime, NSEC_PER_USEC); - - seq_printf(seq, "usage_usec %llu\n" - "user_usec %llu\n" - "system_usec %llu\n", - usage, utime, stime); -} - -int cgroup_stat_init(struct cgroup *cgrp) -{ - int cpu; - - /* the root cgrp has cpu_stat preallocated */ - if (!cgrp->cpu_stat) { - cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); - if (!cgrp->cpu_stat) - return -ENOMEM; - } - - /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - - cstat->updated_children = cgrp; - u64_stats_init(&cstat->sync); - } - - prev_cputime_init(&cgrp->stat.prev_cputime); - - return 0; -} - -void cgroup_stat_exit(struct cgroup *cgrp) -{ - int cpu; - - cgroup_stat_flush(cgrp); - - /* sanity check */ - for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - - if (WARN_ON_ONCE(cstat->updated_children != cgrp) || - WARN_ON_ONCE(cstat->updated_next)) - return; - } - - free_percpu(cgrp->cpu_stat); - cgrp->cpu_stat = NULL; -} - -void __init cgroup_stat_boot(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); - - BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); -} -- cgit v1.2.3-59-g8ed1b From c58632b3631cb222da41d9dc0dd39e106c1eafd0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:04 -0700 Subject: cgroup: Rename stat to rstat stat is too generic a name and ends up causing subtle confusions. It'll be made generic so that controllers can plug into it, which will make the problem worse. Let's rename it to something more specific - cgroup_rstat for cgroup recursive stat. This patch does the following renames. No other changes. * cpu_stat -> rstat_cpu * stat -> rstat * ?cstat -> ?rstatc Note that the renames are selective. The unrenamed are the ones which implement basic resource statistics on top of rstat. This will be further cleaned up in the following patches. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 16 ++-- kernel/cgroup/cgroup-internal.h | 10 +-- kernel/cgroup/cgroup.c | 14 ++-- kernel/cgroup/rstat.c | 180 ++++++++++++++++++++-------------------- 4 files changed, 112 insertions(+), 108 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 133531fcfb33..04cb42419310 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -259,11 +259,11 @@ struct css_set { }; /* - * cgroup basic resource usage statistics. Accounting is done per-cpu in - * cgroup_cpu_stat which is then lazily propagated up the hierarchy on - * reads. + * rstat - cgroup scalable recursive statistics. Accounting is done + * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the + * hierarchy on reads. * - * When a stat gets updated, the cgroup_cpu_stat and its ancestors are + * When a stat gets updated, the cgroup_rstat_cpu and its ancestors are * linked into the updated tree. On the following read, propagation only * considers and consumes the updated tree. This makes reading O(the * number of descendants which have been active since last read) instead of @@ -274,7 +274,7 @@ struct css_set { * become very expensive. By propagating selectively, increasing reading * frequency decreases the cost of each read. */ -struct cgroup_cpu_stat { +struct cgroup_rstat_cpu { /* * ->sync protects all the current counters. These are the only * fields which get updated in the hot path. @@ -297,7 +297,7 @@ struct cgroup_cpu_stat { * to the cgroup makes it unnecessary for each per-cpu struct to * point back to the associated cgroup. * - * Protected by per-cpu cgroup_cpu_stat_lock. + * Protected by per-cpu cgroup_rstat_cpu_lock. */ struct cgroup *updated_children; /* terminated by self cgroup */ struct cgroup *updated_next; /* NULL iff not on the list */ @@ -408,8 +408,10 @@ struct cgroup { */ struct cgroup *dom_cgrp; + /* per-cpu recursive resource statistics */ + struct cgroup_rstat_cpu __percpu *rstat_cpu; + /* cgroup basic resource statistics */ - struct cgroup_cpu_stat __percpu *cpu_stat; struct cgroup_stat pending_stat; /* pending from children */ struct cgroup_stat stat; diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index b928b27050c6..092711114a1f 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -201,13 +201,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, int cgroup_task_count(const struct cgroup *cgrp); /* - * stat.c + * rstat.c */ -void cgroup_stat_flush(struct cgroup *cgrp); -int cgroup_stat_init(struct cgroup *cgrp); -void cgroup_stat_exit(struct cgroup *cgrp); +void cgroup_rstat_flush(struct cgroup *cgrp); +int cgroup_rstat_init(struct cgroup *cgrp); +void cgroup_rstat_exit(struct cgroup *cgrp); void cgroup_stat_show_cputime(struct seq_file *seq); -void cgroup_stat_boot(void); +void cgroup_rstat_boot(void); /* * namespace.c diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index fdb7a582f8fc..32eb7ce0ad71 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -144,14 +144,14 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = { }; #undef SUBSYS -static DEFINE_PER_CPU(struct cgroup_cpu_stat, cgrp_dfl_root_cpu_stat); +static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu); /* * The default hierarchy, reserved for the subsystems that are otherwise * unattached - it never has more than a single cgroup, and all tasks are * part of that cgroup. */ -struct cgroup_root cgrp_dfl_root = { .cgrp.cpu_stat = &cgrp_dfl_root_cpu_stat }; +struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu }; EXPORT_SYMBOL_GPL(cgrp_dfl_root); /* @@ -4592,7 +4592,7 @@ static void css_free_rwork_fn(struct work_struct *work) cgroup_put(cgroup_parent(cgrp)); kernfs_put(cgrp->kn); if (cgroup_on_dfl(cgrp)) - cgroup_stat_exit(cgrp); + cgroup_rstat_exit(cgrp); kfree(cgrp); } else { /* @@ -4629,7 +4629,7 @@ static void css_release_work_fn(struct work_struct *work) trace_cgroup_release(cgrp); if (cgroup_on_dfl(cgrp)) - cgroup_stat_flush(cgrp); + cgroup_rstat_flush(cgrp); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) @@ -4817,7 +4817,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) goto out_free_cgrp; if (cgroup_on_dfl(parent)) { - ret = cgroup_stat_init(cgrp); + ret = cgroup_rstat_init(cgrp); if (ret) goto out_cancel_ref; } @@ -4882,7 +4882,7 @@ out_idr_free: cgroup_idr_remove(&root->cgroup_idr, cgrp->id); out_stat_exit: if (cgroup_on_dfl(parent)) - cgroup_stat_exit(cgrp); + cgroup_rstat_exit(cgrp); out_cancel_ref: percpu_ref_exit(&cgrp->self.refcnt); out_free_cgrp: @@ -5275,7 +5275,7 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files)); - cgroup_stat_boot(); + cgroup_rstat_boot(); /* * The latency of the synchronize_sched() is too high for cgroups, diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1e111dd455c4..6824047b57a9 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -2,26 +2,26 @@ #include -static DEFINE_MUTEX(cgroup_stat_mutex); -static DEFINE_PER_CPU(raw_spinlock_t, cgroup_cpu_stat_lock); +static DEFINE_MUTEX(cgroup_rstat_mutex); +static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); -static struct cgroup_cpu_stat *cgroup_cpu_stat(struct cgroup *cgrp, int cpu) +static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) { - return per_cpu_ptr(cgrp->cpu_stat, cpu); + return per_cpu_ptr(cgrp->rstat_cpu, cpu); } /** - * cgroup_cpu_stat_updated - keep track of updated cpu_stat + * cgroup_rstat_cpu_updated - keep track of updated rstat_cpu * @cgrp: target cgroup - * @cpu: cpu on which cpu_stat was updated + * @cpu: cpu on which rstat_cpu was updated * - * @cgrp's cpu_stat on @cpu was updated. Put it on the parent's matching - * cpu_stat->updated_children list. See the comment on top of - * cgroup_cpu_stat definition for details. + * @cgrp's rstat_cpu on @cpu was updated. Put it on the parent's matching + * rstat_cpu->updated_children list. See the comment on top of + * cgroup_rstat_cpu definition for details. */ -static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) +static void cgroup_rstat_cpu_updated(struct cgroup *cgrp, int cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *parent; unsigned long flags; @@ -33,7 +33,7 @@ static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. */ - if (cgroup_cpu_stat(cgrp, cpu)->updated_next) + if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) return; raw_spin_lock_irqsave(cpu_lock, flags); @@ -41,42 +41,42 @@ static void cgroup_cpu_stat_updated(struct cgroup *cgrp, int cpu) /* put @cgrp and all ancestors on the corresponding updated lists */ for (parent = cgroup_parent(cgrp); parent; cgrp = parent, parent = cgroup_parent(cgrp)) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); /* * Both additions and removals are bottom-up. If a cgroup * is already in the tree, all ancestors are. */ - if (cstat->updated_next) + if (rstatc->updated_next) break; - cstat->updated_next = pcstat->updated_children; - pcstat->updated_children = cgrp; + rstatc->updated_next = prstatc->updated_children; + prstatc->updated_children = cgrp; } raw_spin_unlock_irqrestore(cpu_lock, flags); } /** - * cgroup_cpu_stat_pop_updated - iterate and dismantle cpu_stat updated tree + * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree * @pos: current position * @root: root of the tree to traversal * @cpu: target cpu * - * Walks the udpated cpu_stat tree on @cpu from @root. %NULL @pos starts + * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts * the traversal and %NULL return indicates the end. During traversal, * each returned cgroup is unlinked from the tree. Must be called with the - * matching cgroup_cpu_stat_lock held. + * matching cgroup_rstat_cpu_lock held. * * The only ordering guarantee is that, for a parent and a child pair * covered by a given traversal, if a child is visited, its parent is * guaranteed to be visited afterwards. */ -static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, - struct cgroup *root, int cpu) +static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, + struct cgroup *root, int cpu) { - struct cgroup_cpu_stat *cstat; + struct cgroup_rstat_cpu *rstatc; struct cgroup *parent; if (pos == root) @@ -93,10 +93,10 @@ static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, /* walk down to the first leaf */ while (true) { - cstat = cgroup_cpu_stat(pos, cpu); - if (cstat->updated_children == pos) + rstatc = cgroup_rstat_cpu(pos, cpu); + if (rstatc->updated_children == pos) break; - pos = cstat->updated_children; + pos = rstatc->updated_children; } /* @@ -106,23 +106,23 @@ static struct cgroup *cgroup_cpu_stat_pop_updated(struct cgroup *pos, * child in most cases. The only exception is @root. */ parent = cgroup_parent(pos); - if (parent && cstat->updated_next) { - struct cgroup_cpu_stat *pcstat = cgroup_cpu_stat(parent, cpu); - struct cgroup_cpu_stat *ncstat; + if (parent && rstatc->updated_next) { + struct cgroup_rstat_cpu *prstatc = cgroup_rstat_cpu(parent, cpu); + struct cgroup_rstat_cpu *nrstatc; struct cgroup **nextp; - nextp = &pcstat->updated_children; + nextp = &prstatc->updated_children; while (true) { - ncstat = cgroup_cpu_stat(*nextp, cpu); + nrstatc = cgroup_rstat_cpu(*nextp, cpu); if (*nextp == pos) break; WARN_ON_ONCE(*nextp == parent); - nextp = &ncstat->updated_next; + nextp = &nrstatc->updated_next; } - *nextp = cstat->updated_next; - cstat->updated_next = NULL; + *nextp = rstatc->updated_next; + rstatc->updated_next = NULL; } return pos; @@ -139,19 +139,19 @@ static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) { struct cgroup *parent = cgroup_parent(cgrp); - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); - struct task_cputime *last_cputime = &cstat->last_cputime; + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + struct task_cputime *last_cputime = &rstatc->last_cputime; struct task_cputime cputime; struct cgroup_stat delta; unsigned seq; - lockdep_assert_held(&cgroup_stat_mutex); + lockdep_assert_held(&cgroup_rstat_mutex); /* fetch the current per-cpu values */ do { - seq = __u64_stats_fetch_begin(&cstat->sync); - cputime = cstat->cputime; - } while (__u64_stats_fetch_retry(&cstat->sync, seq)); + seq = __u64_stats_fetch_begin(&rstatc->sync); + cputime = rstatc->cputime; + } while (__u64_stats_fetch_retry(&rstatc->sync, seq)); /* accumulate the deltas to propgate */ delta.cputime.utime = cputime.utime - last_cputime->utime; @@ -170,26 +170,27 @@ static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) cgroup_stat_accumulate(&parent->pending_stat, &delta); } -/* see cgroup_stat_flush() */ -static void cgroup_stat_flush_locked(struct cgroup *cgrp) +/* see cgroup_rstat_flush() */ +static void cgroup_rstat_flush_locked(struct cgroup *cgrp) { int cpu; - lockdep_assert_held(&cgroup_stat_mutex); + lockdep_assert_held(&cgroup_rstat_mutex); for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_cpu_stat_lock, cpu); + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); struct cgroup *pos = NULL; raw_spin_lock_irq(cpu_lock); - while ((pos = cgroup_cpu_stat_pop_updated(pos, cgrp, cpu))) + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) cgroup_cpu_stat_flush_one(pos, cpu); raw_spin_unlock_irq(cpu_lock); } } /** - * cgroup_stat_flush - flush stats in @cgrp's subtree + * cgroup_rstat_flush - flush stats in @cgrp's subtree * @cgrp: target cgroup * * Collect all per-cpu stats in @cgrp's subtree into the global counters @@ -199,61 +200,62 @@ static void cgroup_stat_flush_locked(struct cgroup *cgrp) * This also gets all cgroups in the subtree including @cgrp off the * ->updated_children lists. */ -void cgroup_stat_flush(struct cgroup *cgrp) +void cgroup_rstat_flush(struct cgroup *cgrp) { - mutex_lock(&cgroup_stat_mutex); - cgroup_stat_flush_locked(cgrp); - mutex_unlock(&cgroup_stat_mutex); + mutex_lock(&cgroup_rstat_mutex); + cgroup_rstat_flush_locked(cgrp); + mutex_unlock(&cgroup_rstat_mutex); } -static struct cgroup_cpu_stat *cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +static struct cgroup_rstat_cpu * +cgroup_cpu_stat_account_begin(struct cgroup *cgrp) { - struct cgroup_cpu_stat *cstat; + struct cgroup_rstat_cpu *rstatc; - cstat = get_cpu_ptr(cgrp->cpu_stat); - u64_stats_update_begin(&cstat->sync); - return cstat; + rstatc = get_cpu_ptr(cgrp->rstat_cpu); + u64_stats_update_begin(&rstatc->sync); + return rstatc; } static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, - struct cgroup_cpu_stat *cstat) + struct cgroup_rstat_cpu *rstatc) { - u64_stats_update_end(&cstat->sync); - cgroup_cpu_stat_updated(cgrp, smp_processor_id()); - put_cpu_ptr(cstat); + u64_stats_update_end(&rstatc->sync); + cgroup_rstat_cpu_updated(cgrp, smp_processor_id()); + put_cpu_ptr(rstatc); } void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { - struct cgroup_cpu_stat *cstat; + struct cgroup_rstat_cpu *rstatc; - cstat = cgroup_cpu_stat_account_begin(cgrp); - cstat->cputime.sum_exec_runtime += delta_exec; - cgroup_cpu_stat_account_end(cgrp, cstat); + rstatc = cgroup_cpu_stat_account_begin(cgrp); + rstatc->cputime.sum_exec_runtime += delta_exec; + cgroup_cpu_stat_account_end(cgrp, rstatc); } void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec) { - struct cgroup_cpu_stat *cstat; + struct cgroup_rstat_cpu *rstatc; - cstat = cgroup_cpu_stat_account_begin(cgrp); + rstatc = cgroup_cpu_stat_account_begin(cgrp); switch (index) { case CPUTIME_USER: case CPUTIME_NICE: - cstat->cputime.utime += delta_exec; + rstatc->cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: - cstat->cputime.stime += delta_exec; + rstatc->cputime.stime += delta_exec; break; default: break; } - cgroup_cpu_stat_account_end(cgrp, cstat); + cgroup_cpu_stat_account_end(cgrp, rstatc); } void cgroup_stat_show_cputime(struct seq_file *seq) @@ -264,15 +266,15 @@ void cgroup_stat_show_cputime(struct seq_file *seq) if (!cgroup_parent(cgrp)) return; - mutex_lock(&cgroup_stat_mutex); + mutex_lock(&cgroup_rstat_mutex); - cgroup_stat_flush_locked(cgrp); + cgroup_rstat_flush_locked(cgrp); usage = cgrp->stat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, &utime, &stime); - mutex_unlock(&cgroup_stat_mutex); + mutex_unlock(&cgroup_rstat_mutex); do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); @@ -284,23 +286,23 @@ void cgroup_stat_show_cputime(struct seq_file *seq) usage, utime, stime); } -int cgroup_stat_init(struct cgroup *cgrp) +int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; - /* the root cgrp has cpu_stat preallocated */ - if (!cgrp->cpu_stat) { - cgrp->cpu_stat = alloc_percpu(struct cgroup_cpu_stat); - if (!cgrp->cpu_stat) + /* the root cgrp has rstat_cpu preallocated */ + if (!cgrp->rstat_cpu) { + cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); + if (!cgrp->rstat_cpu) return -ENOMEM; } /* ->updated_children list is self terminated */ for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - cstat->updated_children = cgrp; - u64_stats_init(&cstat->sync); + rstatc->updated_children = cgrp; + u64_stats_init(&rstatc->sync); } prev_cputime_init(&cgrp->stat.prev_cputime); @@ -308,31 +310,31 @@ int cgroup_stat_init(struct cgroup *cgrp) return 0; } -void cgroup_stat_exit(struct cgroup *cgrp) +void cgroup_rstat_exit(struct cgroup *cgrp) { int cpu; - cgroup_stat_flush(cgrp); + cgroup_rstat_flush(cgrp); /* sanity check */ for_each_possible_cpu(cpu) { - struct cgroup_cpu_stat *cstat = cgroup_cpu_stat(cgrp, cpu); + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - if (WARN_ON_ONCE(cstat->updated_children != cgrp) || - WARN_ON_ONCE(cstat->updated_next)) + if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || + WARN_ON_ONCE(rstatc->updated_next)) return; } - free_percpu(cgrp->cpu_stat); - cgrp->cpu_stat = NULL; + free_percpu(cgrp->rstat_cpu); + cgrp->rstat_cpu = NULL; } -void __init cgroup_stat_boot(void) +void __init cgroup_rstat_boot(void) { int cpu; for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_cpu_stat_lock, cpu)); + raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); - BUG_ON(cgroup_stat_init(&cgrp_dfl_root.cgrp)); + BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); } -- cgit v1.2.3-59-g8ed1b From d4ff749b5e0f1e2d4d69a3e4ea81cdeaeb4904d2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:04 -0700 Subject: cgroup: Distinguish base resource stat implementation from rstat Base resource stat accounts universial (not specific to any controller) resource consumptions on top of rstat. Currently, its implementation is intermixed with rstat implementation making the code confusing to follow. This patch clarifies the distintion by doing the followings. * Encapsulate base resource stat counters, currently only cputime, in struct cgroup_base_stat. * Move prev_cputime into struct cgroup and initialize it with cgroup. * Rename the related functions so that they start with cgroup_base_stat. * Prefix the related variables and field names with b. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 29 ++++++++++-------- kernel/cgroup/cgroup-internal.h | 2 +- kernel/cgroup/cgroup.c | 4 ++- kernel/cgroup/rstat.c | 67 ++++++++++++++++++++--------------------- 4 files changed, 52 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 04cb42419310..60d62fe97dc3 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -258,6 +258,10 @@ struct css_set { struct rcu_head rcu_head; }; +struct cgroup_base_stat { + struct task_cputime cputime; +}; + /* * rstat - cgroup scalable recursive statistics. Accounting is done * per-cpu in cgroup_rstat_cpu which is then lazily propagated up the @@ -273,20 +277,24 @@ struct css_set { * aren't active and stat may be read frequently. The combination can * become very expensive. By propagating selectively, increasing reading * frequency decreases the cost of each read. + * + * This struct hosts both the fields which implement the above - + * updated_children and updated_next - and the fields which track basic + * resource statistics on top of it - bsync, bstat and last_bstat. */ struct cgroup_rstat_cpu { /* - * ->sync protects all the current counters. These are the only - * fields which get updated in the hot path. + * ->bsync protects ->bstat. These are the only fields which get + * updated in the hot path. */ - struct u64_stats_sync sync; - struct task_cputime cputime; + struct u64_stats_sync bsync; + struct cgroup_base_stat bstat; /* * Snapshots at the last reading. These are used to calculate the * deltas to propagate to the global counters. */ - struct task_cputime last_cputime; + struct cgroup_base_stat last_bstat; /* * Child cgroups with stat updates on this cpu since the last read @@ -303,12 +311,6 @@ struct cgroup_rstat_cpu { struct cgroup *updated_next; /* NULL iff not on the list */ }; -struct cgroup_stat { - /* per-cpu statistics are collected into the folowing global counters */ - struct task_cputime cputime; - struct prev_cputime prev_cputime; -}; - struct cgroup { /* self css with NULL ->ss, points back to this cgroup */ struct cgroup_subsys_state self; @@ -412,8 +414,9 @@ struct cgroup { struct cgroup_rstat_cpu __percpu *rstat_cpu; /* cgroup basic resource statistics */ - struct cgroup_stat pending_stat; /* pending from children */ - struct cgroup_stat stat; + struct cgroup_base_stat pending_bstat; /* pending from children */ + struct cgroup_base_stat bstat; + struct prev_cputime prev_cputime; /* for printing out cputime */ /* * list of pidlists, up to two for each namespace (one for procs, one diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 092711114a1f..aab4d0a09670 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -206,7 +206,7 @@ int cgroup_task_count(const struct cgroup *cgrp); void cgroup_rstat_flush(struct cgroup *cgrp); int cgroup_rstat_init(struct cgroup *cgrp); void cgroup_rstat_exit(struct cgroup *cgrp); -void cgroup_stat_show_cputime(struct seq_file *seq); +void cgroup_base_stat_cputime_show(struct seq_file *seq); void cgroup_rstat_boot(void); /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 32eb7ce0ad71..31af98996692 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #define CREATE_TRACE_POINTS @@ -1859,6 +1860,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; + prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) INIT_LIST_HEAD(&cgrp->e_csets[ssid]); @@ -3396,7 +3398,7 @@ static int cpu_stat_show(struct seq_file *seq, void *v) struct cgroup __maybe_unused *cgrp = seq_css(seq)->cgroup; int ret = 0; - cgroup_stat_show_cputime(seq); + cgroup_base_stat_cputime_show(seq); #ifdef CONFIG_CGROUP_SCHED ret = cgroup_extra_stat_show(seq, cgrp, cpu_cgrp_id); #endif diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 6824047b57a9..7670191fa776 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -128,30 +128,30 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, return pos; } -static void cgroup_stat_accumulate(struct cgroup_stat *dst_stat, - struct cgroup_stat *src_stat) +static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, + struct cgroup_base_stat *src_bstat) { - dst_stat->cputime.utime += src_stat->cputime.utime; - dst_stat->cputime.stime += src_stat->cputime.stime; - dst_stat->cputime.sum_exec_runtime += src_stat->cputime.sum_exec_runtime; + dst_bstat->cputime.utime += src_bstat->cputime.utime; + dst_bstat->cputime.stime += src_bstat->cputime.stime; + dst_bstat->cputime.sum_exec_runtime += src_bstat->cputime.sum_exec_runtime; } -static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) +static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - struct task_cputime *last_cputime = &rstatc->last_cputime; + struct task_cputime *last_cputime = &rstatc->last_bstat.cputime; struct task_cputime cputime; - struct cgroup_stat delta; + struct cgroup_base_stat delta; unsigned seq; lockdep_assert_held(&cgroup_rstat_mutex); /* fetch the current per-cpu values */ do { - seq = __u64_stats_fetch_begin(&rstatc->sync); - cputime = rstatc->cputime; - } while (__u64_stats_fetch_retry(&rstatc->sync, seq)); + seq = __u64_stats_fetch_begin(&rstatc->bsync); + cputime = rstatc->bstat.cputime; + } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); /* accumulate the deltas to propgate */ delta.cputime.utime = cputime.utime - last_cputime->utime; @@ -161,13 +161,13 @@ static void cgroup_cpu_stat_flush_one(struct cgroup *cgrp, int cpu) *last_cputime = cputime; /* transfer the pending stat into delta */ - cgroup_stat_accumulate(&delta, &cgrp->pending_stat); - memset(&cgrp->pending_stat, 0, sizeof(cgrp->pending_stat)); + cgroup_base_stat_accumulate(&delta, &cgrp->pending_bstat); + memset(&cgrp->pending_bstat, 0, sizeof(cgrp->pending_bstat)); /* propagate delta into the global stat and the parent's pending */ - cgroup_stat_accumulate(&cgrp->stat, &delta); + cgroup_base_stat_accumulate(&cgrp->bstat, &delta); if (parent) - cgroup_stat_accumulate(&parent->pending_stat, &delta); + cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); } /* see cgroup_rstat_flush() */ @@ -184,7 +184,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) raw_spin_lock_irq(cpu_lock); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) - cgroup_cpu_stat_flush_one(pos, cpu); + cgroup_base_stat_flush(pos, cpu); raw_spin_unlock_irq(cpu_lock); } } @@ -208,19 +208,19 @@ void cgroup_rstat_flush(struct cgroup *cgrp) } static struct cgroup_rstat_cpu * -cgroup_cpu_stat_account_begin(struct cgroup *cgrp) +cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) { struct cgroup_rstat_cpu *rstatc; rstatc = get_cpu_ptr(cgrp->rstat_cpu); - u64_stats_update_begin(&rstatc->sync); + u64_stats_update_begin(&rstatc->bsync); return rstatc; } -static void cgroup_cpu_stat_account_end(struct cgroup *cgrp, - struct cgroup_rstat_cpu *rstatc) +static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, + struct cgroup_rstat_cpu *rstatc) { - u64_stats_update_end(&rstatc->sync); + u64_stats_update_end(&rstatc->bsync); cgroup_rstat_cpu_updated(cgrp, smp_processor_id()); put_cpu_ptr(rstatc); } @@ -229,9 +229,9 @@ void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec) { struct cgroup_rstat_cpu *rstatc; - rstatc = cgroup_cpu_stat_account_begin(cgrp); - rstatc->cputime.sum_exec_runtime += delta_exec; - cgroup_cpu_stat_account_end(cgrp, rstatc); + rstatc = cgroup_base_stat_cputime_account_begin(cgrp); + rstatc->bstat.cputime.sum_exec_runtime += delta_exec; + cgroup_base_stat_cputime_account_end(cgrp, rstatc); } void __cgroup_account_cputime_field(struct cgroup *cgrp, @@ -239,26 +239,26 @@ void __cgroup_account_cputime_field(struct cgroup *cgrp, { struct cgroup_rstat_cpu *rstatc; - rstatc = cgroup_cpu_stat_account_begin(cgrp); + rstatc = cgroup_base_stat_cputime_account_begin(cgrp); switch (index) { case CPUTIME_USER: case CPUTIME_NICE: - rstatc->cputime.utime += delta_exec; + rstatc->bstat.cputime.utime += delta_exec; break; case CPUTIME_SYSTEM: case CPUTIME_IRQ: case CPUTIME_SOFTIRQ: - rstatc->cputime.stime += delta_exec; + rstatc->bstat.cputime.stime += delta_exec; break; default: break; } - cgroup_cpu_stat_account_end(cgrp, rstatc); + cgroup_base_stat_cputime_account_end(cgrp, rstatc); } -void cgroup_stat_show_cputime(struct seq_file *seq) +void cgroup_base_stat_cputime_show(struct seq_file *seq) { struct cgroup *cgrp = seq_css(seq)->cgroup; u64 usage, utime, stime; @@ -270,9 +270,8 @@ void cgroup_stat_show_cputime(struct seq_file *seq) cgroup_rstat_flush_locked(cgrp); - usage = cgrp->stat.cputime.sum_exec_runtime; - cputime_adjust(&cgrp->stat.cputime, &cgrp->stat.prev_cputime, - &utime, &stime); + usage = cgrp->bstat.cputime.sum_exec_runtime; + cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); mutex_unlock(&cgroup_rstat_mutex); @@ -302,11 +301,9 @@ int cgroup_rstat_init(struct cgroup *cgrp) struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); rstatc->updated_children = cgrp; - u64_stats_init(&rstatc->sync); + u64_stats_init(&rstatc->bsync); } - prev_cputime_init(&cgrp->stat.prev_cputime); - return 0; } -- cgit v1.2.3-59-g8ed1b From a17556f8d9798e8feff9e34d746e489e78ee1dab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:05 -0700 Subject: cgroup: Reorganize kernel/cgroup/rstat.c Currently, rstat.c has rstat and base stat implementations intermixed. Collect base stat implementation at the end of the file. Also, reorder the prototypes. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 2 +- kernel/cgroup/rstat.c | 182 +++++++++++++++++++++------------------- 2 files changed, 95 insertions(+), 89 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index aab4d0a09670..2bf6fb417588 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -206,8 +206,8 @@ int cgroup_task_count(const struct cgroup *cgrp); void cgroup_rstat_flush(struct cgroup *cgrp); int cgroup_rstat_init(struct cgroup *cgrp); void cgroup_rstat_exit(struct cgroup *cgrp); -void cgroup_base_stat_cputime_show(struct seq_file *seq); void cgroup_rstat_boot(void); +void cgroup_base_stat_cputime_show(struct seq_file *seq); /* * namespace.c diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 7670191fa776..87d7252769e7 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -5,6 +5,8 @@ static DEFINE_MUTEX(cgroup_rstat_mutex); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); +static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); + static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) { return per_cpu_ptr(cgrp->rstat_cpu, cpu); @@ -128,6 +130,98 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, return pos; } +/* see cgroup_rstat_flush() */ +static void cgroup_rstat_flush_locked(struct cgroup *cgrp) +{ + int cpu; + + lockdep_assert_held(&cgroup_rstat_mutex); + + for_each_possible_cpu(cpu) { + raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, + cpu); + struct cgroup *pos = NULL; + + raw_spin_lock_irq(cpu_lock); + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) + cgroup_base_stat_flush(pos, cpu); + raw_spin_unlock_irq(cpu_lock); + } +} + +/** + * cgroup_rstat_flush - flush stats in @cgrp's subtree + * @cgrp: target cgroup + * + * Collect all per-cpu stats in @cgrp's subtree into the global counters + * and propagate them upwards. After this function returns, all cgroups in + * the subtree have up-to-date ->stat. + * + * This also gets all cgroups in the subtree including @cgrp off the + * ->updated_children lists. + */ +void cgroup_rstat_flush(struct cgroup *cgrp) +{ + mutex_lock(&cgroup_rstat_mutex); + cgroup_rstat_flush_locked(cgrp); + mutex_unlock(&cgroup_rstat_mutex); +} + +int cgroup_rstat_init(struct cgroup *cgrp) +{ + int cpu; + + /* the root cgrp has rstat_cpu preallocated */ + if (!cgrp->rstat_cpu) { + cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); + if (!cgrp->rstat_cpu) + return -ENOMEM; + } + + /* ->updated_children list is self terminated */ + for_each_possible_cpu(cpu) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + + rstatc->updated_children = cgrp; + u64_stats_init(&rstatc->bsync); + } + + return 0; +} + +void cgroup_rstat_exit(struct cgroup *cgrp) +{ + int cpu; + + cgroup_rstat_flush(cgrp); + + /* sanity check */ + for_each_possible_cpu(cpu) { + struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); + + if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || + WARN_ON_ONCE(rstatc->updated_next)) + return; + } + + free_percpu(cgrp->rstat_cpu); + cgrp->rstat_cpu = NULL; +} + +void __init cgroup_rstat_boot(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); + + BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); +} + +/* + * Functions for cgroup basic resource statistics implemented on top of + * rstat. + */ static void cgroup_base_stat_accumulate(struct cgroup_base_stat *dst_bstat, struct cgroup_base_stat *src_bstat) { @@ -170,43 +264,6 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) cgroup_base_stat_accumulate(&parent->pending_bstat, &delta); } -/* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp) -{ - int cpu; - - lockdep_assert_held(&cgroup_rstat_mutex); - - for_each_possible_cpu(cpu) { - raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, - cpu); - struct cgroup *pos = NULL; - - raw_spin_lock_irq(cpu_lock); - while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) - cgroup_base_stat_flush(pos, cpu); - raw_spin_unlock_irq(cpu_lock); - } -} - -/** - * cgroup_rstat_flush - flush stats in @cgrp's subtree - * @cgrp: target cgroup - * - * Collect all per-cpu stats in @cgrp's subtree into the global counters - * and propagate them upwards. After this function returns, all cgroups in - * the subtree have up-to-date ->stat. - * - * This also gets all cgroups in the subtree including @cgrp off the - * ->updated_children lists. - */ -void cgroup_rstat_flush(struct cgroup *cgrp) -{ - mutex_lock(&cgroup_rstat_mutex); - cgroup_rstat_flush_locked(cgrp); - mutex_unlock(&cgroup_rstat_mutex); -} - static struct cgroup_rstat_cpu * cgroup_base_stat_cputime_account_begin(struct cgroup *cgrp) { @@ -284,54 +341,3 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) "system_usec %llu\n", usage, utime, stime); } - -int cgroup_rstat_init(struct cgroup *cgrp) -{ - int cpu; - - /* the root cgrp has rstat_cpu preallocated */ - if (!cgrp->rstat_cpu) { - cgrp->rstat_cpu = alloc_percpu(struct cgroup_rstat_cpu); - if (!cgrp->rstat_cpu) - return -ENOMEM; - } - - /* ->updated_children list is self terminated */ - for_each_possible_cpu(cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - - rstatc->updated_children = cgrp; - u64_stats_init(&rstatc->bsync); - } - - return 0; -} - -void cgroup_rstat_exit(struct cgroup *cgrp) -{ - int cpu; - - cgroup_rstat_flush(cgrp); - - /* sanity check */ - for_each_possible_cpu(cpu) { - struct cgroup_rstat_cpu *rstatc = cgroup_rstat_cpu(cgrp, cpu); - - if (WARN_ON_ONCE(rstatc->updated_children != cgrp) || - WARN_ON_ONCE(rstatc->updated_next)) - return; - } - - free_percpu(cgrp->rstat_cpu); - cgrp->rstat_cpu = NULL; -} - -void __init cgroup_rstat_boot(void) -{ - int cpu; - - for_each_possible_cpu(cpu) - raw_spin_lock_init(per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu)); - - BUG_ON(cgroup_rstat_init(&cgrp_dfl_root.cgrp)); -} -- cgit v1.2.3-59-g8ed1b From 6162cef0f741c70eb0c7ac7e6142f85808d8abc4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:05 -0700 Subject: cgroup: Factor out and expose cgroup_rstat_*() interface functions cgroup_rstat is being generalized so that controllers can use it too. This patch factors out and exposes the following interface functions. * cgroup_rstat_updated(): Renamed from cgroup_rstat_cpu_updated() for consistency. * cgroup_rstat_flush_hold/release(): Factored out from base stat implementation. * cgroup_rstat_flush(): Verbatim expose. While at it, drop assert on cgroup_rstat_mutex in cgroup_base_stat_flush() as it crosses layers and make a minor comment update. v2: Added EXPORT_SYMBOL_GPL(cgroup_rstat_updated) to fix a build bug. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 11 +++++++++-- kernel/cgroup/cgroup-internal.h | 1 - kernel/cgroup/rstat.c | 42 +++++++++++++++++++++++++++++------------ 3 files changed, 39 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 473e0c0abb86..5c6018fef5aa 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -690,11 +690,18 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ +#ifdef CONFIG_CGROUPS /* - * Basic resource stats. + * cgroup scalable recursive statistics. */ -#ifdef CONFIG_CGROUPS +void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); +void cgroup_rstat_flush(struct cgroup *cgrp); +void cgroup_rstat_flush_hold(struct cgroup *cgrp); +void cgroup_rstat_flush_release(void); +/* + * Basic resource stats. + */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 2bf6fb417588..b68e1a7c146c 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -203,7 +203,6 @@ int cgroup_task_count(const struct cgroup *cgrp); /* * rstat.c */ -void cgroup_rstat_flush(struct cgroup *cgrp); int cgroup_rstat_init(struct cgroup *cgrp); void cgroup_rstat_exit(struct cgroup *cgrp); void cgroup_rstat_boot(void); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 87d7252769e7..d49bf92ac3d4 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -13,7 +13,7 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) } /** - * cgroup_rstat_cpu_updated - keep track of updated rstat_cpu + * cgroup_rstat_updated - keep track of updated rstat_cpu * @cgrp: target cgroup * @cpu: cpu on which rstat_cpu was updated * @@ -21,7 +21,7 @@ static struct cgroup_rstat_cpu *cgroup_rstat_cpu(struct cgroup *cgrp, int cpu) * rstat_cpu->updated_children list. See the comment on top of * cgroup_rstat_cpu definition for details. */ -static void cgroup_rstat_cpu_updated(struct cgroup *cgrp, int cpu) +void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *parent; @@ -59,6 +59,7 @@ static void cgroup_rstat_cpu_updated(struct cgroup *cgrp, int cpu) raw_spin_unlock_irqrestore(cpu_lock, flags); } +EXPORT_SYMBOL_GPL(cgroup_rstat_updated); /** * cgroup_rstat_cpu_pop_updated - iterate and dismantle rstat_cpu updated tree @@ -167,6 +168,29 @@ void cgroup_rstat_flush(struct cgroup *cgrp) mutex_unlock(&cgroup_rstat_mutex); } +/** + * cgroup_rstat_flush_begin - flush stats in @cgrp's subtree and hold + * @cgrp: target cgroup + * + * Flush stats in @cgrp's subtree and prevent further flushes. Must be + * paired with cgroup_rstat_flush_release(). + */ +void cgroup_rstat_flush_hold(struct cgroup *cgrp) + __acquires(&cgroup_rstat_mutex) +{ + mutex_lock(&cgroup_rstat_mutex); + cgroup_rstat_flush_locked(cgrp); +} + +/** + * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() + */ +void cgroup_rstat_flush_release(void) + __releases(&cgroup_rstat_mutex) +{ + mutex_unlock(&cgroup_rstat_mutex); +} + int cgroup_rstat_init(struct cgroup *cgrp) { int cpu; @@ -239,15 +263,13 @@ static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu) struct cgroup_base_stat delta; unsigned seq; - lockdep_assert_held(&cgroup_rstat_mutex); - /* fetch the current per-cpu values */ do { seq = __u64_stats_fetch_begin(&rstatc->bsync); cputime = rstatc->bstat.cputime; } while (__u64_stats_fetch_retry(&rstatc->bsync, seq)); - /* accumulate the deltas to propgate */ + /* calculate the delta to propgate */ delta.cputime.utime = cputime.utime - last_cputime->utime; delta.cputime.stime = cputime.stime - last_cputime->stime; delta.cputime.sum_exec_runtime = cputime.sum_exec_runtime - @@ -278,7 +300,7 @@ static void cgroup_base_stat_cputime_account_end(struct cgroup *cgrp, struct cgroup_rstat_cpu *rstatc) { u64_stats_update_end(&rstatc->bsync); - cgroup_rstat_cpu_updated(cgrp, smp_processor_id()); + cgroup_rstat_updated(cgrp, smp_processor_id()); put_cpu_ptr(rstatc); } @@ -323,14 +345,10 @@ void cgroup_base_stat_cputime_show(struct seq_file *seq) if (!cgroup_parent(cgrp)) return; - mutex_lock(&cgroup_rstat_mutex); - - cgroup_rstat_flush_locked(cgrp); - + cgroup_rstat_flush_hold(cgrp); usage = cgrp->bstat.cputime.sum_exec_runtime; cputime_adjust(&cgrp->bstat.cputime, &cgrp->prev_cputime, &utime, &stime); - - mutex_unlock(&cgroup_rstat_mutex); + cgroup_rstat_flush_release(); do_div(usage, NSEC_PER_USEC); do_div(utime, NSEC_PER_USEC); -- cgit v1.2.3-59-g8ed1b From 0fa294fb1985c06c4e3325e30e759d4ca580f59a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:05 -0700 Subject: cgroup: Replace cgroup_rstat_mutex with a spinlock Currently, rstat flush path is protected with a mutex which is fine as all the existing users are from interface file show path. However, rstat is being generalized for use by controllers and flushing from atomic contexts will be necessary. This patch replaces cgroup_rstat_mutex with a spinlock and adds a irq-safe flush function - cgroup_rstat_flush_irqsafe(). Explicit yield handling is added to the flush path so that other flush functions can yield to other threads and flushers. Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 1 + kernel/cgroup/rstat.c | 58 +++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 46 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5c6018fef5aa..c9fdf6f57913 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -696,6 +696,7 @@ static inline void cgroup_path_from_kernfs_id(const union kernfs_node_id *id, */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); +void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d49bf92ac3d4..3386fb251a9e 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -2,7 +2,7 @@ #include -static DEFINE_MUTEX(cgroup_rstat_mutex); +static DEFINE_SPINLOCK(cgroup_rstat_lock); static DEFINE_PER_CPU(raw_spinlock_t, cgroup_rstat_cpu_lock); static void cgroup_base_stat_flush(struct cgroup *cgrp, int cpu); @@ -132,21 +132,31 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, } /* see cgroup_rstat_flush() */ -static void cgroup_rstat_flush_locked(struct cgroup *cgrp) +static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) + __releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock) { int cpu; - lockdep_assert_held(&cgroup_rstat_mutex); + lockdep_assert_held(&cgroup_rstat_lock); for_each_possible_cpu(cpu) { raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; - raw_spin_lock_irq(cpu_lock); + raw_spin_lock(cpu_lock); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) cgroup_base_stat_flush(pos, cpu); - raw_spin_unlock_irq(cpu_lock); + raw_spin_unlock(cpu_lock); + + /* if @may_sleep, play nice and yield if necessary */ + if (may_sleep && (need_resched() || + spin_needbreak(&cgroup_rstat_lock))) { + spin_unlock_irq(&cgroup_rstat_lock); + if (!cond_resched()) + cpu_relax(); + spin_lock_irq(&cgroup_rstat_lock); + } } } @@ -160,12 +170,31 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp) * * This also gets all cgroups in the subtree including @cgrp off the * ->updated_children lists. + * + * This function may block. */ void cgroup_rstat_flush(struct cgroup *cgrp) { - mutex_lock(&cgroup_rstat_mutex); - cgroup_rstat_flush_locked(cgrp); - mutex_unlock(&cgroup_rstat_mutex); + might_sleep(); + + spin_lock_irq(&cgroup_rstat_lock); + cgroup_rstat_flush_locked(cgrp, true); + spin_unlock_irq(&cgroup_rstat_lock); +} + +/** + * cgroup_rstat_flush_irqsafe - irqsafe version of cgroup_rstat_flush() + * @cgrp: target cgroup + * + * This function can be called from any context. + */ +void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_rstat_lock, flags); + cgroup_rstat_flush_locked(cgrp, false); + spin_unlock_irqrestore(&cgroup_rstat_lock, flags); } /** @@ -174,21 +203,24 @@ void cgroup_rstat_flush(struct cgroup *cgrp) * * Flush stats in @cgrp's subtree and prevent further flushes. Must be * paired with cgroup_rstat_flush_release(). + * + * This function may block. */ void cgroup_rstat_flush_hold(struct cgroup *cgrp) - __acquires(&cgroup_rstat_mutex) + __acquires(&cgroup_rstat_lock) { - mutex_lock(&cgroup_rstat_mutex); - cgroup_rstat_flush_locked(cgrp); + might_sleep(); + spin_lock_irq(&cgroup_rstat_lock); + cgroup_rstat_flush_locked(cgrp, true); } /** * cgroup_rstat_flush_release - release cgroup_rstat_flush_hold() */ void cgroup_rstat_flush_release(void) - __releases(&cgroup_rstat_mutex) + __releases(&cgroup_rstat_lock) { - mutex_unlock(&cgroup_rstat_mutex); + spin_unlock_irq(&cgroup_rstat_lock); } int cgroup_rstat_init(struct cgroup *cgrp) -- cgit v1.2.3-59-g8ed1b From 8f53470bab04229e93ff9e4c20338cc08b42b344 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:05 -0700 Subject: cgroup: Add cgroup_subsys->css_rstat_flush() This patch adds cgroup_subsys->css_rstat_flush(). If a subsystem has this callback, its csses are linked on cgrp->css_rstat_list and rstat will call the function whenever the associated cgroup is flushed. Flush is also performed when such csses are released so that residual counts aren't lost. Combined with the rstat API previous patches factored out, this allows controllers to plug into rstat to manage their statistics in a scalable way. Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 5 +++++ kernel/cgroup/cgroup.c | 11 +++++++++++ kernel/cgroup/rstat.c | 11 ++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 60d62fe97dc3..c0e68f903011 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -130,6 +130,9 @@ struct cgroup_subsys_state { struct list_head sibling; struct list_head children; + /* flush target list anchored at cgrp->rstat_css_list */ + struct list_head rstat_css_node; + /* * PI: Subsys-unique ID. 0 is unused and root is always 1. The * matching css can be looked up using css_from_id(). @@ -412,6 +415,7 @@ struct cgroup { /* per-cpu recursive resource statistics */ struct cgroup_rstat_cpu __percpu *rstat_cpu; + struct list_head rstat_css_list; /* cgroup basic resource statistics */ struct cgroup_base_stat pending_bstat; /* pending from children */ @@ -577,6 +581,7 @@ struct cgroup_subsys { void (*css_released)(struct cgroup_subsys_state *css); void (*css_free)(struct cgroup_subsys_state *css); void (*css_reset)(struct cgroup_subsys_state *css); + void (*css_rstat_flush)(struct cgroup_subsys_state *css, int cpu); int (*css_extra_stat_show)(struct seq_file *seq, struct cgroup_subsys_state *css); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 31af98996692..04b7e7fad31a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1860,6 +1860,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) cgrp->dom_cgrp = cgrp; cgrp->max_descendants = INT_MAX; cgrp->max_depth = INT_MAX; + INIT_LIST_HEAD(&cgrp->rstat_css_list); prev_cputime_init(&cgrp->prev_cputime); for_each_subsys(ss, ssid) @@ -4621,6 +4622,11 @@ static void css_release_work_fn(struct work_struct *work) if (ss) { /* css release path */ + if (!list_empty(&css->rstat_css_node)) { + cgroup_rstat_flush(cgrp); + list_del_rcu(&css->rstat_css_node); + } + cgroup_idr_replace(&ss->css_idr, NULL, css->id); if (ss->css_released) ss->css_released(css); @@ -4682,6 +4688,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->id = -1; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); + INIT_LIST_HEAD(&css->rstat_css_node); css->serial_nr = css_serial_nr_next++; atomic_set(&css->online_cnt, 0); @@ -4690,6 +4697,9 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css_get(css->parent); } + if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush) + list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list); + BUG_ON(cgroup_css(cgrp, ss)); } @@ -4791,6 +4801,7 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, err_list_del: list_del_rcu(&css->sibling); err_free_css: + list_del_rcu(&css->rstat_css_node); INIT_RCU_WORK(&css->destroy_rwork, css_free_rwork_fn); queue_rcu_work(cgroup_destroy_wq, &css->destroy_rwork); return ERR_PTR(err); diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 3386fb251a9e..339366e257d4 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -145,8 +145,17 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) struct cgroup *pos = NULL; raw_spin_lock(cpu_lock); - while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) + while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { + struct cgroup_subsys_state *css; + cgroup_base_stat_flush(pos, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(css, &pos->rstat_css_list, + rstat_css_node) + css->ss->css_rstat_flush(css, cpu); + rcu_read_unlock(); + } raw_spin_unlock(cpu_lock); /* if @may_sleep, play nice and yield if necessary */ -- cgit v1.2.3-59-g8ed1b From 9a9e97b2f1f27ec70e3c138a790bd5627747d868 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:05 -0700 Subject: cgroup: Add memory barriers to plug cgroup_rstat_updated() race window cgroup_rstat_updated() has a small race window where an updated signaling can race with flush and could be lost till the next update. This wasn't a problem for the existing usages, but we plan to use rstat to track counters which need to be accurate. This patch plugs the race window by synchronizing cgroup_rstat_updated() and flush path with memory barriers around cgroup_rstat_cpu->updated_next pointer. Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 339366e257d4..a4901e20735a 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -28,9 +28,12 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) unsigned long flags; /* - * Speculative already-on-list test. This may race leading to - * temporary inaccuracies, which is fine. - * + * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we + * see NULL updated_next or they see our updated stat. + */ + smp_mb(); + + /* * Because @parent's updated_children is terminated with @parent * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. @@ -126,6 +129,13 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, *nextp = rstatc->updated_next; rstatc->updated_next = NULL; + + /* + * Paired with the one in cgroup_rstat_cpu_updated(). + * Either they see NULL updated_next or we see their + * updated stat. + */ + smp_mb(); } return pos; -- cgit v1.2.3-59-g8ed1b From c43c5ea75fd53cae3c8b63e1a27efc4cff6508d3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 26 Apr 2018 14:29:06 -0700 Subject: cgroup: Make cgroup_rstat_updated() ready for root cgroup usage cgroup_rstat_updated() ensures that the cgroup's rstat is linked to the parent. If there's no parent, it never gets linked and the function ends up grabbing and releasing the cgroup_rstat_lock each time for no reason which can be expensive. This hasn't been a problem till now because nobody was calling the function for the root cgroup but rstat is gonna be exposed to controllers and use cases, so let's get ready. Make cgroup_rstat_updated() an no-op for the root cgroup. Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index a4901e20735a..d503d1a9007c 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -27,6 +27,10 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) struct cgroup *parent; unsigned long flags; + /* nothing to do for root */ + if (!cgroup_parent(cgrp)) + return; + /* * Paired with the one in cgroup_rstat_cpu_pop_upated(). Either we * see NULL updated_next or they see our updated stat. -- cgit v1.2.3-59-g8ed1b From cc659e76f375ab158fb682c1e39b2a22bf9f7657 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 4 May 2018 17:42:16 +0300 Subject: rdmacg: Convert to use match_string() helper The new helper returns index of the matching string in an array. We are going to use it here. Signed-off-by: Andy Shevchenko Signed-off-by: Tejun Heo --- kernel/cgroup/rdma.c | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index defad3c5e7dc..d3bbb757ee49 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -362,35 +362,32 @@ EXPORT_SYMBOL(rdmacg_unregister_device); static int parse_resource(char *c, int *intval) { substring_t argstr; - const char **table = &rdmacg_resource_names[0]; char *name, *value = c; size_t len; - int ret, i = 0; + int ret, i; name = strsep(&value, "="); if (!name || !value) return -EINVAL; - len = strlen(value); + i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name); + if (i < 0) + return i; - for (i = 0; i < RDMACG_RESOURCE_MAX; i++) { - if (strcmp(table[i], name)) - continue; + len = strlen(value); - argstr.from = value; - argstr.to = value + len; + argstr.from = value; + argstr.to = value + len; - ret = match_int(&argstr, intval); - if (ret >= 0) { - if (*intval < 0) - break; - return i; - } - if (strncmp(value, RDMACG_MAX_STR, len) == 0) { - *intval = S32_MAX; - return i; - } - break; + ret = match_int(&argstr, intval); + if (ret >= 0) { + if (*intval < 0) + return -EINVAL; + return i; + } + if (strncmp(value, RDMACG_MAX_STR, len) == 0) { + *intval = S32_MAX; + return i; } return -EINVAL; } -- cgit v1.2.3-59-g8ed1b From d8742e22902186e30c346b1ba881cb52942ae3e4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 23 May 2018 11:04:54 -0700 Subject: cgroup: css_set_lock should nest inside tasklist_lock cgroup_enable_task_cg_lists() incorrectly nests non-irq-safe tasklist_lock inside irq-safe css_set_lock triggering the following lockdep warning. WARNING: possible irq lock inversion dependency detected 4.17.0-rc1-00027-gb37d049 #6 Not tainted -------------------------------------------------------- systemd/1 just changed the state of lock: 00000000fe57773b (css_set_lock){..-.}, at: cgroup_free+0xf2/0x12a but this lock took another, SOFTIRQ-unsafe lock in the past: (tasklist_lock){.+.+} and interrupts could create inverse lock ordering between them. other info that might help us debug this: Possible interrupt unsafe locking scenario: CPU0 CPU1 ---- ---- lock(tasklist_lock); local_irq_disable(); lock(css_set_lock); lock(tasklist_lock); lock(css_set_lock); *** DEADLOCK *** The condition is highly unlikely to actually happen especially given that the path is executed only once per boot. Signed-off-by: Tejun Heo Reported-by: Boqun Feng --- kernel/cgroup/cgroup.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 04b7e7fad31a..63989cb44566 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -1798,13 +1798,6 @@ static void cgroup_enable_task_cg_lists(void) { struct task_struct *p, *g; - spin_lock_irq(&css_set_lock); - - if (use_task_css_set_links) - goto out_unlock; - - use_task_css_set_links = true; - /* * We need tasklist_lock because RCU is not safe against * while_each_thread(). Besides, a forking task that has passed @@ -1813,6 +1806,13 @@ static void cgroup_enable_task_cg_lists(void) * tasklist if we walk through it with RCU. */ read_lock(&tasklist_lock); + spin_lock_irq(&css_set_lock); + + if (use_task_css_set_links) + goto out_unlock; + + use_task_css_set_links = true; + do_each_thread(g, p) { WARN_ON_ONCE(!list_empty(&p->cg_list) || task_css_set(p) != &init_css_set); @@ -1840,9 +1840,9 @@ static void cgroup_enable_task_cg_lists(void) } spin_unlock(&p->sighand->siglock); } while_each_thread(g, p); - read_unlock(&tasklist_lock); out_unlock: spin_unlock_irq(&css_set_lock); + read_unlock(&tasklist_lock); } static void init_cgroup_housekeeping(struct cgroup *cgrp) -- cgit v1.2.3-59-g8ed1b