diff options
| author | 2019-07-20 07:07:56 +0300 | |
|---|---|---|
| committer | 2019-07-20 07:07:56 +0300 | |
| commit | c39f2d9db0fd81ea20bb5cce9b3f082ca63753e2 (patch) | |
| tree | 8e80ed5601b4fb8880a2ca8e08802bc8b1f850bd /kernel/cgroup | |
| parent | Merge branch 'next' into for-linus (diff) | |
| parent | Input: alps - fix a mismatch between a condition check and its comment (diff) | |
Merge branch 'next' into for-linus
Prepare second round of input updates for 5.3 merge window.
Diffstat (limited to 'kernel/cgroup')
| -rw-r--r-- | kernel/cgroup/Makefile | 4 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup-internal.h | 8 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup-v1.c | 17 | ||||
| -rw-r--r-- | kernel/cgroup/cgroup.c | 363 | ||||
| -rw-r--r-- | kernel/cgroup/cpuset.c | 26 | ||||
| -rw-r--r-- | kernel/cgroup/debug.c | 8 | ||||
| -rw-r--r-- | kernel/cgroup/freezer.c | 635 | ||||
| -rw-r--r-- | kernel/cgroup/legacy_freezer.c | 481 | ||||
| -rw-r--r-- | kernel/cgroup/pids.c | 5 | ||||
| -rw-r--r-- | kernel/cgroup/rdma.c | 5 | ||||
| -rw-r--r-- | kernel/cgroup/rstat.c | 1 |
11 files changed, 1065 insertions, 488 deletions
diff --git a/kernel/cgroup/Makefile b/kernel/cgroup/Makefile index bfcdae896122..5d7a76bfbbb7 100644 --- a/kernel/cgroup/Makefile +++ b/kernel/cgroup/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 -obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o +obj-y := cgroup.o rstat.o namespace.o cgroup-v1.o freezer.o -obj-$(CONFIG_CGROUP_FREEZER) += freezer.o +obj-$(CONFIG_CGROUP_FREEZER) += legacy_freezer.o obj-$(CONFIG_CGROUP_PIDS) += pids.o obj-$(CONFIG_CGROUP_RDMA) += rdma.o obj-$(CONFIG_CPUSETS) += cpuset.o diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 30e39f3932ad..809e34a3c017 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -28,12 +28,15 @@ extern void __init enable_debug_cgroup(void); #define TRACE_CGROUP_PATH(type, cgrp, ...) \ do { \ if (trace_cgroup_##type##_enabled()) { \ - spin_lock(&trace_cgroup_path_lock); \ + unsigned long flags; \ + spin_lock_irqsave(&trace_cgroup_path_lock, \ + flags); \ cgroup_path(cgrp, trace_cgroup_path, \ TRACE_CGROUP_PATH_LEN); \ trace_cgroup_##type(cgrp, trace_cgroup_path, \ ##__VA_ARGS__); \ - spin_unlock(&trace_cgroup_path_lock); \ + spin_unlock_irqrestore(&trace_cgroup_path_lock, \ + flags); \ } \ } while (0) @@ -240,6 +243,7 @@ int cgroup_rmdir(struct kernfs_node *kn); int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, struct kernfs_root *kf_root); +int __cgroup_task_count(const struct cgroup *cgrp); int cgroup_task_count(const struct cgroup *cgrp); /* diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index c126b34fd4ff..88006be40ea3 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/ctype.h> @@ -342,22 +343,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find_create(struct cgroup *cgrp, return l; } -/** - * cgroup_task_count - count the number of tasks in a cgroup. - * @cgrp: the cgroup in question - */ -int cgroup_task_count(const struct cgroup *cgrp) -{ - int count = 0; - struct cgrp_cset_link *link; - - spin_lock_irq(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += link->cset->nr_tasks; - spin_unlock_irq(&css_set_lock); - return count; -} - /* * Load a cgroup's pidarray with either procs' tgids or tasks' pids */ diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3f2b4bde0f9c..bf9dbffd46b1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_advance(struct css_task_iter *it); +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -593,6 +594,39 @@ static void cgroup_get_live(struct cgroup *cgrp) css_get(&cgrp->self); } +/** + * __cgroup_task_count - count the number of tasks in a cgroup. The caller + * is responsible for taking the css_set_lock. + * @cgrp: the cgroup in question + */ +int __cgroup_task_count(const struct cgroup *cgrp) +{ + int count = 0; + struct cgrp_cset_link *link; + + lockdep_assert_held(&css_set_lock); + + list_for_each_entry(link, &cgrp->cset_links, cset_link) + count += link->cset->nr_tasks; + + return count; +} + +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + */ +int cgroup_task_count(const struct cgroup *cgrp) +{ + int count; + + spin_lock_irq(&css_set_lock); + count = __cgroup_task_count(cgrp); + spin_unlock_irq(&css_set_lock); + + return count; +} + struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -705,6 +739,7 @@ struct css_set init_css_set = { .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), @@ -783,6 +818,8 @@ static void cgroup_update_populated(struct cgroup *cgrp, bool populated) break; cgroup1_check_for_release(cgrp); + TRACE_CGROUP_PATH(notify_populated, cgrp, + cgroup_is_populated(cgrp)); cgroup_file_notify(&cgrp->events_file); child = cgrp; @@ -808,6 +845,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } +/* + * @task is leaving, advance task iterators which are pointing to it so + * that they can resume at the next position. Advancing an iterator might + * remove it from the list, use safe walk. See css_task_iter_skip() for + * details. + */ +static void css_set_skip_task_iters(struct css_set *cset, + struct task_struct *task) +{ + struct css_task_iter *it, *pos; + + list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) + css_task_iter_skip(it, task); +} + /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -833,22 +885,9 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { - struct css_task_iter *it, *pos; - WARN_ON_ONCE(list_empty(&task->cg_list)); - /* - * @task is leaving, advance task iterators which are - * pointing to it so that they can resume at the next - * position. Advancing an iterator might remove it from - * the list, use safe walk. See css_task_iter_advance*() - * for details. - */ - list_for_each_entry_safe(it, pos, &from_cset->task_iters, - iters_node) - if (it->task_pos == &task->cg_list) - css_task_iter_advance(it); - + css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -1175,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); @@ -1775,11 +1815,13 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, + Opt_memory_localevents, nr__cgroup2_params }; static const struct fs_parameter_spec cgroup2_param_specs[] = { - fsparam_flag ("nsdelegate", Opt_nsdelegate), + fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("memory_localevents", Opt_memory_localevents), {} }; @@ -1802,6 +1844,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_memory_localevents: + ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + return 0; } return -EINVAL; } @@ -1813,6 +1858,11 @@ static void apply_cgroup_root_flags(unsigned int root_flags) cgrp_dfl_root.flags |= CGRP_ROOT_NS_DELEGATE; else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; + else + cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS; } } @@ -1820,6 +1870,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) + seq_puts(seq, ",memory_localevents"); return 0; } @@ -2402,8 +2454,15 @@ static int cgroup_migrate_execute(struct cgroup_mgctx *mgctx) get_css_set(to_cset); to_cset->nr_tasks++; css_set_move_task(task, from_cset, to_cset, true); - put_css_set_locked(from_cset); from_cset->nr_tasks--; + /* + * If the source or destination cgroup is frozen, + * the task might require to change its state. + */ + cgroup_freezer_migrate_task(task, from_cset->dfl_cgrp, + to_cset->dfl_cgrp); + put_css_set_locked(from_cset); + } } spin_unlock_irq(&css_set_lock); @@ -2602,7 +2661,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) - goto err; + return -ENOMEM; WARN_ON_ONCE(src_cset->mg_dst_cset || dst_cset->mg_dst_cset); @@ -2634,9 +2693,6 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) } return 0; -err: - cgroup_migrate_finish(mgctx); - return -ENOMEM; } /** @@ -3447,8 +3503,11 @@ static ssize_t cgroup_max_depth_write(struct kernfs_open_file *of, static int cgroup_events_show(struct seq_file *seq, void *v) { - seq_printf(seq, "populated %d\n", - cgroup_is_populated(seq_css(seq)->cgroup)); + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "populated %d\n", cgroup_is_populated(cgrp)); + seq_printf(seq, "frozen %d\n", test_bit(CGRP_FROZEN, &cgrp->flags)); + return 0; } @@ -3498,17 +3557,118 @@ static int cpu_stat_show(struct seq_file *seq, void *v) #ifdef CONFIG_PSI static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_IO); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_MEM); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { - return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU); + struct cgroup *cgroup = seq_css(seq)->cgroup; + struct psi_group *psi = cgroup->id == 1 ? &psi_system : &cgroup->psi; + + return psi_show(seq, psi, PSI_CPU); +} + +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, enum psi_res res) +{ + struct psi_trigger *new; + struct cgroup *cgrp; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENODEV; + + cgroup_get(cgrp); + cgroup_kn_unlock(of->kn); + + new = psi_trigger_create(&cgrp->psi, buf, nbytes, res); + if (IS_ERR(new)) { + cgroup_put(cgrp); + return PTR_ERR(new); + } + + psi_trigger_replace(&of->priv, new); + + cgroup_put(cgrp); + + return nbytes; +} + +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_IO); +} + +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM); +} + +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU); +} + +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, + poll_table *pt) +{ + return psi_trigger_poll(&of->priv, of->file, pt); +} + +static void cgroup_pressure_release(struct kernfs_open_file *of) +{ + psi_trigger_replace(&of->priv, NULL); +} +#endif /* CONFIG_PSI */ + +static int cgroup_freeze_show(struct seq_file *seq, void *v) +{ + struct cgroup *cgrp = seq_css(seq)->cgroup; + + seq_printf(seq, "%d\n", cgrp->freezer.freeze); + + return 0; +} + +static ssize_t cgroup_freeze_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct cgroup *cgrp; + ssize_t ret; + int freeze; + + ret = kstrtoint(strstrip(buf), 0, &freeze); + if (ret) + return ret; + + if (freeze < 0 || freeze > 1) + return -ERANGE; + + cgrp = cgroup_kn_lock_live(of->kn, false); + if (!cgrp) + return -ENOENT; + + cgroup_freeze(cgrp, freeze); + + cgroup_kn_unlock(of->kn); + + return nbytes; } -#endif static int cgroup_file_open(struct kernfs_open_file *of) { @@ -4253,15 +4413,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset)); + } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; - else + else if (!list_empty(&cset->mg_tasks)) it->task_pos = cset->mg_tasks.next; + else + it->task_pos = cset->dying_tasks.next; it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4287,9 +4450,20 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task) +{ + lockdep_assert_held(&css_set_lock); + + if (it->task_pos == &task->cg_list) { + it->task_pos = it->task_pos->next; + it->flags |= CSS_TASK_ITER_SKIPPED; + } +} + static void css_task_iter_advance(struct css_task_iter *it) { - struct list_head *next; + struct task_struct *task; lockdep_assert_held(&css_set_lock); repeat: @@ -4299,25 +4473,40 @@ repeat: * consumed first and then ->mg_tasks. After ->mg_tasks, * we move onto the next cset. */ - next = it->task_pos->next; - - if (next == it->tasks_head) - next = it->mg_tasks_head->next; + if (it->flags & CSS_TASK_ITER_SKIPPED) + it->flags &= ~CSS_TASK_ITER_SKIPPED; + else + it->task_pos = it->task_pos->next; - if (next == it->mg_tasks_head) + if (it->task_pos == it->tasks_head) + it->task_pos = it->mg_tasks_head->next; + if (it->task_pos == it->mg_tasks_head) + it->task_pos = it->dying_tasks_head->next; + if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); - else - it->task_pos = next; } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } - /* if PROCS, skip over tasks which aren't group leaders */ - if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && - !thread_group_leader(list_entry(it->task_pos, struct task_struct, - cg_list))) - goto repeat; + if (!it->task_pos) + return; + + task = list_entry(it->task_pos, struct task_struct, cg_list); + + if (it->flags & CSS_TASK_ITER_PROCS) { + /* if PROCS, skip over tasks which aren't group leaders */ + if (!thread_group_leader(task)) + goto repeat; + + /* and dying leaders w/o live member threads */ + if (!atomic_read(&task->signal->live)) + goto repeat; + } else { + /* skip all dying ones */ + if (task->flags & PF_EXITING) + goto repeat; + } } /** @@ -4373,6 +4562,10 @@ struct task_struct *css_task_iter_next(struct css_task_iter *it) spin_lock_irq(&css_set_lock); + /* @it may be half-advanced by skips, finish advancing */ + if (it->flags & CSS_TASK_ITER_SKIPPED) + css_task_iter_advance(it); + if (it->task_pos) { it->cur_task = list_entry(it->task_pos, struct task_struct, cg_list); @@ -4654,6 +4847,12 @@ static struct cftype cgroup_base_files[] = { .seq_show = cgroup_stat_show, }, { + .name = "cgroup.freeze", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cgroup_freeze_show, + .write = cgroup_freeze_write, + }, + { .name = "cpu.stat", .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cpu_stat_show, @@ -4661,20 +4860,26 @@ static struct cftype cgroup_base_files[] = { #ifdef CONFIG_PSI { .name = "io.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_io_pressure_show, + .write = cgroup_io_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "memory.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_memory_pressure_show, + .write = cgroup_memory_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, { .name = "cpu.pressure", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_cpu_pressure_show, + .write = cgroup_cpu_pressure_write, + .poll = cgroup_pressure_poll, + .release = cgroup_pressure_release, }, -#endif +#endif /* CONFIG_PSI */ { } /* terminate */ }; @@ -4781,9 +4986,11 @@ static void css_release_work_fn(struct work_struct *work) if (cgroup_on_dfl(cgrp)) cgroup_rstat_flush(cgrp); + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) tcgrp->nr_dying_descendants--; + spin_unlock_irq(&css_set_lock); cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id); cgrp->id = -1; @@ -5001,12 +5208,31 @@ static struct cgroup *cgroup_create(struct cgroup *parent) if (ret) goto out_psi_free; + /* + * New cgroup inherits effective freeze counter, and + * if the parent has to be frozen, the child has too. + */ + cgrp->freezer.e_freeze = parent->freezer.e_freeze; + if (cgrp->freezer.e_freeze) + set_bit(CGRP_FROZEN, &cgrp->flags); + + spin_lock_irq(&css_set_lock); for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) { cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; - if (tcgrp != cgrp) + if (tcgrp != cgrp) { tcgrp->nr_descendants++; + + /* + * If the new cgroup is frozen, all ancestor cgroups + * get a new frozen descendant, but their state can't + * change because of this. + */ + if (cgrp->freezer.e_freeze) + tcgrp->freezer.nr_frozen_descendants++; + } } + spin_unlock_irq(&css_set_lock); if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5291,10 +5517,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) if (parent && cgroup_is_threaded(cgrp)) parent->nr_threaded_children--; + spin_lock_irq(&css_set_lock); for (tcgrp = cgroup_parent(cgrp); tcgrp; tcgrp = cgroup_parent(tcgrp)) { tcgrp->nr_descendants--; tcgrp->nr_dying_descendants++; + /* + * If the dying cgroup is frozen, decrease frozen descendants + * counters of ancestor cgroups. + */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + tcgrp->freezer.nr_frozen_descendants--; } + spin_unlock_irq(&css_set_lock); cgroup1_check_for_release(parent); @@ -5746,6 +5980,26 @@ void cgroup_post_fork(struct task_struct *child) cset->nr_tasks++; css_set_move_task(child, NULL, cset, false); } + + /* + * If the cgroup has to be frozen, the new task has too. + * Let's set the JOBCTL_TRAP_FREEZE jobctl bit to get + * the task into the frozen state. + */ + if (unlikely(cgroup_task_freeze(child))) { + spin_lock(&child->sighand->siglock); + WARN_ON_ONCE(child->frozen); + child->jobctl |= JOBCTL_TRAP_FREEZE; + spin_unlock(&child->sighand->siglock); + + /* + * Calling cgroup_update_frozen() isn't required here, + * because it will be called anyway a bit later + * from do_freezer_trap(). So we avoid cgroup's + * transient switch from the frozen state and back. + */ + } + spin_unlock_irq(&css_set_lock); } @@ -5793,7 +6047,13 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; + + WARN_ON_ONCE(cgroup_task_frozen(tsk)); + if (unlikely(cgroup_task_freeze(tsk))) + cgroup_update_frozen(task_dfl_cgroup(tsk)); + spin_unlock_irq(&css_set_lock); } else { get_css_set(cset); @@ -5813,6 +6073,13 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); + + if (use_task_css_set_links) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) @@ -6116,7 +6383,7 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate); static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return snprintf(buf, PAGE_SIZE, "nsdelegate\n"); + return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n"); } static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features); diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 4834c4214e9c..515525ff1cfd 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -740,11 +740,10 @@ static inline int nr_cpusets(void) * Must be called with cpuset_mutex held. * * The three key local variables below are: - * q - a linked-list queue of cpuset pointers, used to implement a - * top-down scan of all cpusets. This scan loads a pointer - * to each cpuset marked is_sched_load_balance into the - * array 'csa'. For our purposes, rebuilding the schedulers - * sched domains, we can ignore !is_sched_load_balance cpusets. + * cp - cpuset pointer, used (together with pos_css) to perform a + * top-down scan of all cpusets. For our purposes, rebuilding + * the schedulers sched domains, we can ignore !is_sched_load_ + * balance cpusets. * csa - (for CpuSet Array) Array of pointers to all the cpusets * that need to be load balanced, for convenient iterative * access by the subsequent code that finds the best partition, @@ -775,7 +774,7 @@ static inline int nr_cpusets(void) static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr **attributes) { - struct cpuset *cp; /* scans q */ + struct cpuset *cp; /* top-down scan of cpusets */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ @@ -3255,10 +3254,23 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) spin_unlock_irqrestore(&callback_lock, flags); } +/** + * cpuset_cpus_allowed_fallback - final fallback before complete catastrophe. + * @tsk: pointer to task_struct with which the scheduler is struggling + * + * Description: In the case that the scheduler cannot find an allowed cpu in + * tsk->cpus_allowed, we fall back to task_cs(tsk)->cpus_allowed. In legacy + * mode however, this value is the same as task_cs(tsk)->effective_cpus, + * which will not contain a sane cpumask during cases such as cpu hotplugging. + * This is the absolute last resort for the scheduler and it is only used if + * _every_ other avenue has been traveled. + **/ + void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { rcu_read_lock(); - do_set_cpus_allowed(tsk, task_cs(tsk)->effective_cpus); + do_set_cpus_allowed(tsk, is_in_v2_mode() ? + task_cs(tsk)->cpus_allowed : cpu_possible_mask); rcu_read_unlock(); /* diff --git a/kernel/cgroup/debug.c b/kernel/cgroup/debug.c index 5f1b87330bee..80aa3f027ac3 100644 --- a/kernel/cgroup/debug.c +++ b/kernel/cgroup/debug.c @@ -64,8 +64,8 @@ static int current_css_set_read(struct seq_file *seq, void *v) css = cset->subsys[ss->id]; if (!css) continue; - seq_printf(seq, "%2d: %-4s\t- %lx[%d]\n", ss->id, ss->name, - (unsigned long)css, css->id); + seq_printf(seq, "%2d: %-4s\t- %p[%d]\n", ss->id, ss->name, + css, css->id); } rcu_read_unlock(); spin_unlock_irq(&css_set_lock); @@ -224,8 +224,8 @@ static int cgroup_subsys_states_read(struct seq_file *seq, void *v) if (css->parent) snprintf(pbuf, sizeof(pbuf) - 1, " P=%d", css->parent->id); - seq_printf(seq, "%2d: %-4s\t- %lx[%d] %d%s\n", ss->id, ss->name, - (unsigned long)css, css->id, + seq_printf(seq, "%2d: %-4s\t- %p[%d] %d%s\n", ss->id, ss->name, + css, css->id, atomic_read(&css->online_cnt), pbuf); } diff --git a/kernel/cgroup/freezer.c b/kernel/cgroup/freezer.c index 08236798d173..8cf010680678 100644 --- a/kernel/cgroup/freezer.c +++ b/kernel/cgroup/freezer.c @@ -1,481 +1,314 @@ -/* - * cgroup_freezer.c - control group freezer subsystem - * - * Copyright IBM Corporation, 2007 - * - * Author : Cedric Le Goater <clg@fr.ibm.com> - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2.1 of the GNU Lesser General Public License - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - */ - -#include <linux/export.h> -#include <linux/slab.h> +//SPDX-License-Identifier: GPL-2.0 #include <linux/cgroup.h> -#include <linux/fs.h> -#include <linux/uaccess.h> -#include <linux/freezer.h> -#include <linux/seq_file.h> -#include <linux/mutex.h> - -/* - * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is - * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared - * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING - * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of - * its ancestors has FREEZING_SELF set. - */ -enum freezer_state_flags { - CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ - CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ - CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ - CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ +#include <linux/sched.h> +#include <linux/sched/task.h> +#include <linux/sched/signal.h> - /* mask for all FREEZING flags */ - CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, -}; +#include "cgroup-internal.h" -struct freezer { - struct cgroup_subsys_state css; - unsigned int state; -}; +#include <trace/events/cgroup.h> -static DEFINE_MUTEX(freezer_mutex); - -static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) -{ - return css ? container_of(css, struct freezer, css) : NULL; -} - -static inline struct freezer *task_freezer(struct task_struct *task) +/* + * Propagate the cgroup frozen state upwards by the cgroup tree. + */ +static void cgroup_propagate_frozen(struct cgroup *cgrp, bool frozen) { - return css_freezer(task_css(task, freezer_cgrp_id)); -} + int desc = 1; -static struct freezer *parent_freezer(struct freezer *freezer) -{ - return css_freezer(freezer->css.parent); + /* + * If the new state is frozen, some freezing ancestor cgroups may change + * their state too, depending on if all their descendants are frozen. + * + * Otherwise, all ancestor cgroups are forced into the non-frozen state. + */ + while ((cgrp = cgroup_parent(cgrp))) { + if (frozen) { + cgrp->freezer.nr_frozen_descendants += desc; + if (!test_bit(CGRP_FROZEN, &cgrp->flags) && + test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_descendants == + cgrp->nr_descendants) { + set_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 1); + desc++; + } + } else { + cgrp->freezer.nr_frozen_descendants -= desc; + if (test_bit(CGRP_FROZEN, &cgrp->flags)) { + clear_bit(CGRP_FROZEN, &cgrp->flags); + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, 0); + desc++; + } + } + } } -bool cgroup_freezing(struct task_struct *task) +/* + * Revisit the cgroup frozen state. + * Checks if the cgroup is really frozen and perform all state transitions. + */ +void cgroup_update_frozen(struct cgroup *cgrp) { - bool ret; + bool frozen; - rcu_read_lock(); - ret = task_freezer(task)->state & CGROUP_FREEZING; - rcu_read_unlock(); + lockdep_assert_held(&css_set_lock); - return ret; -} + /* + * If the cgroup has to be frozen (CGRP_FREEZE bit set), + * and all tasks are frozen and/or stopped, let's consider + * the cgroup frozen. Otherwise it's not frozen. + */ + frozen = test_bit(CGRP_FREEZE, &cgrp->flags) && + cgrp->freezer.nr_frozen_tasks == __cgroup_task_count(cgrp); -static const char *freezer_state_strs(unsigned int state) -{ - if (state & CGROUP_FROZEN) - return "FROZEN"; - if (state & CGROUP_FREEZING) - return "FREEZING"; - return "THAWED"; -}; + if (frozen) { + /* Already there? */ + if (test_bit(CGRP_FROZEN, &cgrp->flags)) + return; -static struct cgroup_subsys_state * -freezer_css_alloc(struct cgroup_subsys_state *parent_css) -{ - struct freezer *freezer; + set_bit(CGRP_FROZEN, &cgrp->flags); + } else { + /* Already there? */ + if (!test_bit(CGRP_FROZEN, &cgrp->flags)) + return; - freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); - if (!freezer) - return ERR_PTR(-ENOMEM); + clear_bit(CGRP_FROZEN, &cgrp->flags); + } + cgroup_file_notify(&cgrp->events_file); + TRACE_CGROUP_PATH(notify_frozen, cgrp, frozen); - return &freezer->css; + /* Update the state of ancestor cgroups. */ + cgroup_propagate_frozen(cgrp, frozen); } -/** - * freezer_css_online - commit creation of a freezer css - * @css: css being created - * - * We're committing to creation of @css. Mark it online and inherit - * parent's freezing state while holding both parent's and our - * freezer->lock. +/* + * Increment cgroup's nr_frozen_tasks. */ -static int freezer_css_online(struct cgroup_subsys_state *css) +static void cgroup_inc_frozen_cnt(struct cgroup *cgrp) { - struct freezer *freezer = css_freezer(css); - struct freezer *parent = parent_freezer(freezer); - - mutex_lock(&freezer_mutex); - - freezer->state |= CGROUP_FREEZER_ONLINE; - - if (parent && (parent->state & CGROUP_FREEZING)) { - freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; - atomic_inc(&system_freezing_cnt); - } - - mutex_unlock(&freezer_mutex); - return 0; + cgrp->freezer.nr_frozen_tasks++; } -/** - * freezer_css_offline - initiate destruction of a freezer css - * @css: css being destroyed - * - * @css is going away. Mark it dead and decrement system_freezing_count if - * it was holding one. +/* + * Decrement cgroup's nr_frozen_tasks. */ -static void freezer_css_offline(struct cgroup_subsys_state *css) +static void cgroup_dec_frozen_cnt(struct cgroup *cgrp) { - struct freezer *freezer = css_freezer(css); - - mutex_lock(&freezer_mutex); - - if (freezer->state & CGROUP_FREEZING) - atomic_dec(&system_freezing_cnt); - - freezer->state = 0; - - mutex_unlock(&freezer_mutex); + cgrp->freezer.nr_frozen_tasks--; + WARN_ON_ONCE(cgrp->freezer.nr_frozen_tasks < 0); } -static void freezer_css_free(struct cgroup_subsys_state *css) +/* + * Enter frozen/stopped state, if not yet there. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. + */ +void cgroup_enter_frozen(void) { - kfree(css_freezer(css)); + struct cgroup *cgrp; + + if (current->frozen) + return; + + spin_lock_irq(&css_set_lock); + current->frozen = true; + cgrp = task_dfl_cgroup(current); + cgroup_inc_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); } /* - * Tasks can be migrated into a different freezer anytime regardless of its - * current state. freezer_attach() is responsible for making new tasks - * conform to the current state. + * Conditionally leave frozen/stopped state. Update cgroup's counters, + * and revisit the state of the cgroup, if necessary. * - * Freezer state changes and task migration are synchronized via - * @freezer->lock. freezer_attach() makes the new tasks conform to the - * current state and all following state changes can see the new tasks. + * If always_leave is not set, and the cgroup is freezing, + * we're racing with the cgroup freezing. In this case, we don't + * drop the frozen counter to avoid a transient switch to + * the unfrozen state. */ -static void freezer_attach(struct cgroup_taskset *tset) +void cgroup_leave_frozen(bool always_leave) { - struct task_struct *task; - struct cgroup_subsys_state *new_css; - - mutex_lock(&freezer_mutex); + struct cgroup *cgrp; - /* - * Make the new tasks conform to the current state of @new_css. - * For simplicity, when migrating any task to a FROZEN cgroup, we - * revert it to FREEZING and let update_if_frozen() determine the - * correct state later. - * - * Tasks in @tset are on @new_css but may not conform to its - * current state before executing the following - !frozen tasks may - * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. - */ - cgroup_taskset_for_each(task, new_css, tset) { - struct freezer *freezer = css_freezer(new_css); - - if (!(freezer->state & CGROUP_FREEZING)) { - __thaw_task(task); - } else { - freeze_task(task); - /* clear FROZEN and propagate upwards */ - while (freezer && (freezer->state & CGROUP_FROZEN)) { - freezer->state &= ~CGROUP_FROZEN; - freezer = parent_freezer(freezer); - } - } + spin_lock_irq(&css_set_lock); + cgrp = task_dfl_cgroup(current); + if (always_leave || !test_bit(CGRP_FREEZE, &cgrp->flags)) { + cgroup_dec_frozen_cnt(cgrp); + cgroup_update_frozen(cgrp); + WARN_ON_ONCE(!current->frozen); + current->frozen = false; + } else if (!(current->jobctl & JOBCTL_TRAP_FREEZE)) { + spin_lock(¤t->sighand->siglock); + current->jobctl |= JOBCTL_TRAP_FREEZE; + set_thread_flag(TIF_SIGPENDING); + spin_unlock(¤t->sighand->siglock); } - - mutex_unlock(&freezer_mutex); + spin_unlock_irq(&css_set_lock); } -/** - * freezer_fork - cgroup post fork callback - * @task: a task which has just been forked - * - * @task has just been created and should conform to the current state of - * the cgroup_freezer it belongs to. This function may race against - * freezer_attach(). Losing to freezer_attach() means that we don't have - * to do anything as freezer_attach() will put @task into the appropriate - * state. +/* + * Freeze or unfreeze the task by setting or clearing the JOBCTL_TRAP_FREEZE + * jobctl bit. */ -static void freezer_fork(struct task_struct *task) +static void cgroup_freeze_task(struct task_struct *task, bool freeze) { - struct freezer *freezer; + unsigned long flags; - /* - * The root cgroup is non-freezable, so we can skip locking the - * freezer. This is safe regardless of race with task migration. - * If we didn't race or won, skipping is obviously the right thing - * to do. If we lost and root is the new cgroup, noop is still the - * right thing to do. - */ - if (task_css_is_root(task, freezer_cgrp_id)) + /* If the task is about to die, don't bother with freezing it. */ + if (!lock_task_sighand(task, &flags)) return; - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - freezer = task_freezer(task); - if (freezer->state & CGROUP_FREEZING) - freeze_task(task); + if (freeze) { + task->jobctl |= JOBCTL_TRAP_FREEZE; + signal_wake_up(task, false); + } else { + task->jobctl &= ~JOBCTL_TRAP_FREEZE; + wake_up_process(task); + } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); + unlock_task_sighand(task, &flags); } -/** - * update_if_frozen - update whether a cgroup finished freezing - * @css: css of interest - * - * Once FREEZING is initiated, transition to FROZEN is lazily updated by - * calling this function. If the current state is FREEZING but not FROZEN, - * this function checks whether all tasks of this cgroup and the descendant - * cgroups finished freezing and, if so, sets FROZEN. - * - * The caller is responsible for grabbing RCU read lock and calling - * update_if_frozen() on all descendants prior to invoking this function. - * - * Task states and freezer state might disagree while tasks are being - * migrated into or out of @css, so we can't verify task states against - * @freezer state here. See freezer_attach() for details. +/* + * Freeze or unfreeze all tasks in the given cgroup. */ -static void update_if_frozen(struct cgroup_subsys_state *css) +static void cgroup_do_freeze(struct cgroup *cgrp, bool freeze) { - struct freezer *freezer = css_freezer(css); - struct cgroup_subsys_state *pos; struct css_task_iter it; struct task_struct *task; - lockdep_assert_held(&freezer_mutex); - - if (!(freezer->state & CGROUP_FREEZING) || - (freezer->state & CGROUP_FROZEN)) - return; - - /* are all (live) children frozen? */ - rcu_read_lock(); - css_for_each_child(pos, css) { - struct freezer *child = css_freezer(pos); + lockdep_assert_held(&cgroup_mutex); - if ((child->state & CGROUP_FREEZER_ONLINE) && - !(child->state & CGROUP_FROZEN)) { - rcu_read_unlock(); - return; - } - } - rcu_read_unlock(); + spin_lock_irq(&css_set_lock); + if (freeze) + set_bit(CGRP_FREEZE, &cgrp->flags); + else + clear_bit(CGRP_FREEZE, &cgrp->flags); + spin_unlock_irq(&css_set_lock); - /* are all tasks frozen? */ - css_task_iter_start(css, 0, &it); + if (freeze) + TRACE_CGROUP_PATH(freeze, cgrp); + else + TRACE_CGROUP_PATH(unfreeze, cgrp); + css_task_iter_start(&cgrp->self, 0, &it); while ((task = css_task_iter_next(&it))) { - if (freezing(task)) { - /* - * freezer_should_skip() indicates that the task - * should be skipped when determining freezing - * completion. Consider it frozen in addition to - * the usual frozen condition. - */ - if (!frozen(task) && !freezer_should_skip(task)) - goto out_iter_end; - } - } - - freezer->state |= CGROUP_FROZEN; -out_iter_end: - css_task_iter_end(&it); -} - -static int freezer_read(struct seq_file *m, void *v) -{ - struct cgroup_subsys_state *css = seq_css(m), *pos; - - mutex_lock(&freezer_mutex); - rcu_read_lock(); - - /* update states bottom-up */ - css_for_each_descendant_post(pos, css) { - if (!css_tryget_online(pos)) + /* + * Ignore kernel threads here. Freezing cgroups containing + * kthreads isn't supported. + */ + if (task->flags & PF_KTHREAD) continue; - rcu_read_unlock(); - - update_if_frozen(pos); - - rcu_read_lock(); - css_put(pos); + cgroup_freeze_task(task, freeze); } - - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); - - seq_puts(m, freezer_state_strs(css_freezer(css)->state)); - seq_putc(m, '\n'); - return 0; -} - -static void freeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - freeze_task(task); css_task_iter_end(&it); -} - -static void unfreeze_cgroup(struct freezer *freezer) -{ - struct css_task_iter it; - struct task_struct *task; - css_task_iter_start(&freezer->css, 0, &it); - while ((task = css_task_iter_next(&it))) - __thaw_task(task); - css_task_iter_end(&it); + /* + * Cgroup state should be revisited here to cover empty leaf cgroups + * and cgroups which descendants are already in the desired state. + */ + spin_lock_irq(&css_set_lock); + if (cgrp->nr_descendants == cgrp->freezer.nr_frozen_descendants) + cgroup_update_frozen(cgrp); + spin_unlock_irq(&css_set_lock); } -/** - * freezer_apply_state - apply state change to a single cgroup_freezer - * @freezer: freezer to apply state change to - * @freeze: whether to freeze or unfreeze - * @state: CGROUP_FREEZING_* flag to set or clear - * - * Set or clear @state on @cgroup according to @freeze, and perform - * freezing or thawing as necessary. +/* + * Adjust the task state (freeze or unfreeze) and revisit the state of + * source and destination cgroups. */ -static void freezer_apply_state(struct freezer *freezer, bool freeze, - unsigned int state) +void cgroup_freezer_migrate_task(struct task_struct *task, + struct cgroup *src, struct cgroup *dst) { - /* also synchronizes against task migration, see freezer_attach() */ - lockdep_assert_held(&freezer_mutex); + lockdep_assert_held(&css_set_lock); - if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + /* + * Kernel threads are not supposed to be frozen at all. + */ + if (task->flags & PF_KTHREAD) return; - if (freeze) { - if (!(freezer->state & CGROUP_FREEZING)) - atomic_inc(&system_freezing_cnt); - freezer->state |= state; - freeze_cgroup(freezer); - } else { - bool was_freezing = freezer->state & CGROUP_FREEZING; - - freezer->state &= ~state; - - if (!(freezer->state & CGROUP_FREEZING)) { - if (was_freezing) - atomic_dec(&system_freezing_cnt); - freezer->state &= ~CGROUP_FROZEN; - unfreeze_cgroup(freezer); - } + /* + * Adjust counters of freezing and frozen tasks. + * Note, that if the task is frozen, but the destination cgroup is not + * frozen, we bump both counters to keep them balanced. + */ + if (task->frozen) { + cgroup_inc_frozen_cnt(dst); + cgroup_dec_frozen_cnt(src); } -} - -/** - * freezer_change_state - change the freezing state of a cgroup_freezer - * @freezer: freezer of interest - * @freeze: whether to freeze or thaw - * - * Freeze or thaw @freezer according to @freeze. The operations are - * recursive - all descendants of @freezer will be affected. - */ -static void freezer_change_state(struct freezer *freezer, bool freeze) -{ - struct cgroup_subsys_state *pos; + cgroup_update_frozen(dst); + cgroup_update_frozen(src); /* - * Update all its descendants in pre-order traversal. Each - * descendant will try to inherit its parent's FREEZING state as - * CGROUP_FREEZING_PARENT. + * Force the task to the desired state. */ - mutex_lock(&freezer_mutex); - rcu_read_lock(); - css_for_each_descendant_pre(pos, &freezer->css) { - struct freezer *pos_f = css_freezer(pos); - struct freezer *parent = parent_freezer(pos_f); - - if (!css_tryget_online(pos)) - continue; - rcu_read_unlock(); - - if (pos_f == freezer) - freezer_apply_state(pos_f, freeze, - CGROUP_FREEZING_SELF); - else - freezer_apply_state(pos_f, - parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); - - rcu_read_lock(); - css_put(pos); - } - rcu_read_unlock(); - mutex_unlock(&freezer_mutex); + cgroup_freeze_task(task, test_bit(CGRP_FREEZE, &dst->flags)); } -static ssize_t freezer_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) +void cgroup_freeze(struct cgroup *cgrp, bool freeze) { - bool freeze; + struct cgroup_subsys_state *css; + struct cgroup *dsct; + bool applied = false; - buf = strstrip(buf); + lockdep_assert_held(&cgroup_mutex); - if (strcmp(buf, freezer_state_strs(0)) == 0) - freeze = false; - else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) - freeze = true; - else - return -EINVAL; + /* + * Nothing changed? Just exit. + */ + if (cgrp->freezer.freeze == freeze) + return; - freezer_change_state(css_freezer(of_css(of)), freeze); - return nbytes; -} + cgrp->freezer.freeze = freeze; -static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); + /* + * Propagate changes downwards the cgroup tree. + */ + css_for_each_descendant_pre(css, &cgrp->self) { + dsct = css->cgroup; - return (bool)(freezer->state & CGROUP_FREEZING_SELF); -} + if (cgroup_is_dead(dsct)) + continue; -static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, - struct cftype *cft) -{ - struct freezer *freezer = css_freezer(css); + if (freeze) { + dsct->freezer.e_freeze++; + /* + * Already frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 1) + continue; + } else { + dsct->freezer.e_freeze--; + /* + * Still frozen because of ancestor's settings? + */ + if (dsct->freezer.e_freeze > 0) + continue; - return (bool)(freezer->state & CGROUP_FREEZING_PARENT); -} + WARN_ON_ONCE(dsct->freezer.e_freeze < 0); + } -static struct cftype files[] = { - { - .name = "state", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = freezer_read, - .write = freezer_write, - }, - { - .name = "self_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_self_freezing_read, - }, - { - .name = "parent_freezing", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = freezer_parent_freezing_read, - }, - { } /* terminate */ -}; + /* + * Do change actual state: freeze or unfreeze. + */ + cgroup_do_freeze(dsct, freeze); + applied = true; + } -struct cgroup_subsys freezer_cgrp_subsys = { - .css_alloc = freezer_css_alloc, - .css_online = freezer_css_online, - .css_offline = freezer_css_offline, - .css_free = freezer_css_free, - .attach = freezer_attach, - .fork = freezer_fork, - .legacy_cftypes = files, -}; + /* + * Even if the actual state hasn't changed, let's notify a user. + * The state can be enforced by an ancestor cgroup: the cgroup + * can already be in the desired state or it can be locked in the + * opposite state, so that the transition will never happen. + * In both cases it's better to notify a user, that there is + * nothing to wait for. + */ + if (!applied) { + TRACE_CGROUP_PATH(notify_frozen, cgrp, + test_bit(CGRP_FROZEN, &cgrp->flags)); + cgroup_file_notify(&cgrp->events_file); + } +} diff --git a/kernel/cgroup/legacy_freezer.c b/kernel/cgroup/legacy_freezer.c new file mode 100644 index 000000000000..08236798d173 --- /dev/null +++ b/kernel/cgroup/legacy_freezer.c @@ -0,0 +1,481 @@ +/* + * cgroup_freezer.c - control group freezer subsystem + * + * Copyright IBM Corporation, 2007 + * + * Author : Cedric Le Goater <clg@fr.ibm.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/cgroup.h> +#include <linux/fs.h> +#include <linux/uaccess.h> +#include <linux/freezer.h> +#include <linux/seq_file.h> +#include <linux/mutex.h> + +/* + * A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is + * set if "FROZEN" is written to freezer.state cgroupfs file, and cleared + * for "THAWED". FREEZING_PARENT is set if the parent freezer is FREEZING + * for whatever reason. IOW, a cgroup has FREEZING_PARENT set if one of + * its ancestors has FREEZING_SELF set. + */ +enum freezer_state_flags { + CGROUP_FREEZER_ONLINE = (1 << 0), /* freezer is fully online */ + CGROUP_FREEZING_SELF = (1 << 1), /* this freezer is freezing */ + CGROUP_FREEZING_PARENT = (1 << 2), /* the parent freezer is freezing */ + CGROUP_FROZEN = (1 << 3), /* this and its descendants frozen */ + + /* mask for all FREEZING flags */ + CGROUP_FREEZING = CGROUP_FREEZING_SELF | CGROUP_FREEZING_PARENT, +}; + +struct freezer { + struct cgroup_subsys_state css; + unsigned int state; +}; + +static DEFINE_MUTEX(freezer_mutex); + +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + +static inline struct freezer *task_freezer(struct task_struct *task) +{ + return css_freezer(task_css(task, freezer_cgrp_id)); +} + +static struct freezer *parent_freezer(struct freezer *freezer) +{ + return css_freezer(freezer->css.parent); +} + +bool cgroup_freezing(struct task_struct *task) +{ + bool ret; + + rcu_read_lock(); + ret = task_freezer(task)->state & CGROUP_FREEZING; + rcu_read_unlock(); + + return ret; +} + +static const char *freezer_state_strs(unsigned int state) +{ + if (state & CGROUP_FROZEN) + return "FROZEN"; + if (state & CGROUP_FREEZING) + return "FREEZING"; + return "THAWED"; +}; + +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct freezer *freezer; + + freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL); + if (!freezer) + return ERR_PTR(-ENOMEM); + + return &freezer->css; +} + +/** + * freezer_css_online - commit creation of a freezer css + * @css: css being created + * + * We're committing to creation of @css. Mark it online and inherit + * parent's freezing state while holding both parent's and our + * freezer->lock. + */ +static int freezer_css_online(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct freezer *parent = parent_freezer(freezer); + + mutex_lock(&freezer_mutex); + + freezer->state |= CGROUP_FREEZER_ONLINE; + + if (parent && (parent->state & CGROUP_FREEZING)) { + freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN; + atomic_inc(&system_freezing_cnt); + } + + mutex_unlock(&freezer_mutex); + return 0; +} + +/** + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed + * + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. + */ +static void freezer_css_offline(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + + mutex_lock(&freezer_mutex); + + if (freezer->state & CGROUP_FREEZING) + atomic_dec(&system_freezing_cnt); + + freezer->state = 0; + + mutex_unlock(&freezer_mutex); +} + +static void freezer_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_freezer(css)); +} + +/* + * Tasks can be migrated into a different freezer anytime regardless of its + * current state. freezer_attach() is responsible for making new tasks + * conform to the current state. + * + * Freezer state changes and task migration are synchronized via + * @freezer->lock. freezer_attach() makes the new tasks conform to the + * current state and all following state changes can see the new tasks. + */ +static void freezer_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *new_css; + + mutex_lock(&freezer_mutex); + + /* + * Make the new tasks conform to the current state of @new_css. + * For simplicity, when migrating any task to a FROZEN cgroup, we + * revert it to FREEZING and let update_if_frozen() determine the + * correct state later. + * + * Tasks in @tset are on @new_css but may not conform to its + * current state before executing the following - !frozen tasks may + * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. + */ + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + + if (!(freezer->state & CGROUP_FREEZING)) { + __thaw_task(task); + } else { + freeze_task(task); + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } + } + } + + mutex_unlock(&freezer_mutex); +} + +/** + * freezer_fork - cgroup post fork callback + * @task: a task which has just been forked + * + * @task has just been created and should conform to the current state of + * the cgroup_freezer it belongs to. This function may race against + * freezer_attach(). Losing to freezer_attach() means that we don't have + * to do anything as freezer_attach() will put @task into the appropriate + * state. + */ +static void freezer_fork(struct task_struct *task) +{ + struct freezer *freezer; + + /* + * The root cgroup is non-freezable, so we can skip locking the + * freezer. This is safe regardless of race with task migration. + * If we didn't race or won, skipping is obviously the right thing + * to do. If we lost and root is the new cgroup, noop is still the + * right thing to do. + */ + if (task_css_is_root(task, freezer_cgrp_id)) + return; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + freezer = task_freezer(task); + if (freezer->state & CGROUP_FREEZING) + freeze_task(task); + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +/** + * update_if_frozen - update whether a cgroup finished freezing + * @css: css of interest + * + * Once FREEZING is initiated, transition to FROZEN is lazily updated by + * calling this function. If the current state is FREEZING but not FROZEN, + * this function checks whether all tasks of this cgroup and the descendant + * cgroups finished freezing and, if so, sets FROZEN. + * + * The caller is responsible for grabbing RCU read lock and calling + * update_if_frozen() on all descendants prior to invoking this function. + * + * Task states and freezer state might disagree while tasks are being + * migrated into or out of @css, so we can't verify task states against + * @freezer state here. See freezer_attach() for details. + */ +static void update_if_frozen(struct cgroup_subsys_state *css) +{ + struct freezer *freezer = css_freezer(css); + struct cgroup_subsys_state *pos; + struct css_task_iter it; + struct task_struct *task; + + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZING) || + (freezer->state & CGROUP_FROZEN)) + return; + + /* are all (live) children frozen? */ + rcu_read_lock(); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); + + if ((child->state & CGROUP_FREEZER_ONLINE) && + !(child->state & CGROUP_FROZEN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + + /* are all tasks frozen? */ + css_task_iter_start(css, 0, &it); + + while ((task = css_task_iter_next(&it))) { + if (freezing(task)) { + /* + * freezer_should_skip() indicates that the task + * should be skipped when determining freezing + * completion. Consider it frozen in addition to + * the usual frozen condition. + */ + if (!frozen(task) && !freezer_should_skip(task)) + goto out_iter_end; + } + } + + freezer->state |= CGROUP_FROZEN; +out_iter_end: + css_task_iter_end(&it); +} + +static int freezer_read(struct seq_file *m, void *v) +{ + struct cgroup_subsys_state *css = seq_css(m), *pos; + + mutex_lock(&freezer_mutex); + rcu_read_lock(); + + /* update states bottom-up */ + css_for_each_descendant_post(pos, css) { + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + update_if_frozen(pos); + + rcu_read_lock(); + css_put(pos); + } + + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); + + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); + seq_putc(m, '\n'); + return 0; +} + +static void freeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + freeze_task(task); + css_task_iter_end(&it); +} + +static void unfreeze_cgroup(struct freezer *freezer) +{ + struct css_task_iter it; + struct task_struct *task; + + css_task_iter_start(&freezer->css, 0, &it); + while ((task = css_task_iter_next(&it))) + __thaw_task(task); + css_task_iter_end(&it); +} + +/** + * freezer_apply_state - apply state change to a single cgroup_freezer + * @freezer: freezer to apply state change to + * @freeze: whether to freeze or unfreeze + * @state: CGROUP_FREEZING_* flag to set or clear + * + * Set or clear @state on @cgroup according to @freeze, and perform + * freezing or thawing as necessary. + */ +static void freezer_apply_state(struct freezer *freezer, bool freeze, + unsigned int state) +{ + /* also synchronizes against task migration, see freezer_attach() */ + lockdep_assert_held(&freezer_mutex); + + if (!(freezer->state & CGROUP_FREEZER_ONLINE)) + return; + + if (freeze) { + if (!(freezer->state & CGROUP_FREEZING)) + atomic_inc(&system_freezing_cnt); + freezer->state |= state; + freeze_cgroup(freezer); + } else { + bool was_freezing = freezer->state & CGROUP_FREEZING; + + freezer->state &= ~state; + + if (!(freezer->state & CGROUP_FREEZING)) { + if (was_freezing) + atomic_dec(&system_freezing_cnt); + freezer->state &= ~CGROUP_FROZEN; + unfreeze_cgroup(freezer); + } + } +} + +/** + * freezer_change_state - change the freezing state of a cgroup_freezer + * @freezer: freezer of interest + * @freeze: whether to freeze or thaw + * + * Freeze or thaw @freezer according to @freeze. The operations are + * recursive - all descendants of @freezer will be affected. + */ +static void freezer_change_state(struct freezer *freezer, bool freeze) +{ + struct cgroup_subsys_state *pos; + + /* + * Update all its descendants in pre-order traversal. Each + * descendant will try to inherit its parent's FREEZING state as + * CGROUP_FREEZING_PARENT. + */ + mutex_lock(&freezer_mutex); + rcu_read_lock(); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); + struct freezer *parent = parent_freezer(pos_f); + + if (!css_tryget_online(pos)) + continue; + rcu_read_unlock(); + + if (pos_f == freezer) + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + else + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + + rcu_read_lock(); + css_put(pos); + } + rcu_read_unlock(); + mutex_unlock(&freezer_mutex); +} + +static ssize_t freezer_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + bool freeze; + + buf = strstrip(buf); + + if (strcmp(buf, freezer_state_strs(0)) == 0) + freeze = false; + else if (strcmp(buf, freezer_state_strs(CGROUP_FROZEN)) == 0) + freeze = true; + else + return -EINVAL; + + freezer_change_state(css_freezer(of_css(of)), freeze); + return nbytes; +} + +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_SELF); +} + +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct freezer *freezer = css_freezer(css); + + return (bool)(freezer->state & CGROUP_FREEZING_PARENT); +} + +static struct cftype files[] = { + { + .name = "state", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = freezer_read, + .write = freezer_write, + }, + { + .name = "self_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_self_freezing_read, + }, + { + .name = "parent_freezing", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = freezer_parent_freezing_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys freezer_cgrp_subsys = { + .css_alloc = freezer_css_alloc, + .css_online = freezer_css_online, + .css_offline = freezer_css_offline, + .css_free = freezer_css_free, + .attach = freezer_attach, + .fork = freezer_fork, + .legacy_cftypes = files, +}; diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index c9960baaa14f..8e513a573fe9 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * Process number limiting controller for cgroups. * @@ -25,10 +26,6 @@ * a superset of parent/child/pids.current. * * Copyright (C) 2015 Aleksa Sarai <cyphar@cyphar.com> - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include <linux/kernel.h> diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index 1d75ae7f1cb7..ae042c347c64 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only /* * RDMA resource limiting controller for cgroups. * @@ -5,10 +6,6 @@ * additional RDMA resources after a certain limit is reached. * * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com> - * - * This file is subject to the terms and conditions of version 2 of the GNU - * General Public License. See the file COPYING in the main directory of the - * Linux distribution for more details. */ #include <linux/bitops.h> diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bb95a35e8c2d..ca19b4c8acf5 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -1,3 +1,4 @@ +// SPDX-License-Identifier: GPL-2.0-only #include "cgroup-internal.h" #include <linux/sched/cputime.h> |
