diff options
Diffstat (limited to 'kernel/cgroup/cgroup.c')
-rw-r--r-- | kernel/cgroup/cgroup.c | 173 |
1 files changed, 108 insertions, 65 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 1779ccddb734..ffaccd6373f1 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -279,8 +279,6 @@ bool cgroup_ssid_enabled(int ssid) * * - When mounting an existing superblock, mount options should match. * - * - Remount is disallowed. - * * - rename(2) is disallowed. * * - "tasks" is removed. Everything should be at process granularity. Use @@ -765,7 +763,8 @@ struct css_set init_css_set = { .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), - .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node), + .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node), + .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node), .mg_node = LIST_HEAD_INIT(init_css_set.mg_node), /* @@ -1240,7 +1239,8 @@ static struct css_set *find_css_set(struct css_set *old_cset, INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); INIT_LIST_HEAD(&cset->cgrp_links); - INIT_LIST_HEAD(&cset->mg_preload_node); + INIT_LIST_HEAD(&cset->mg_src_preload_node); + INIT_LIST_HEAD(&cset->mg_dst_preload_node); INIT_LIST_HEAD(&cset->mg_node); /* Copy the set of subsystem state objects generated in @@ -1307,6 +1307,20 @@ struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root) return root_cgrp->root; } +void cgroup_favor_dynmods(struct cgroup_root *root, bool favor) +{ + bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS; + + /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */ + if (favor && !favoring) { + rcu_sync_enter(&cgroup_threadgroup_rwsem.rss); + root->flags |= CGRP_ROOT_FAVOR_DYNMODS; + } else if (!favor && favoring) { + rcu_sync_exit(&cgroup_threadgroup_rwsem.rss); + root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS; + } +} + static int cgroup_init_root_id(struct cgroup_root *root) { int id; @@ -1367,6 +1381,7 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_root_count--; } + cgroup_favor_dynmods(root, false); cgroup_exit_root_id(root); mutex_unlock(&cgroup_mutex); @@ -1376,6 +1391,31 @@ static void cgroup_destroy_root(struct cgroup_root *root) cgroup_free_root(root); } +static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset, + struct cgroup_root *root) +{ + struct cgroup *res_cgroup = NULL; + + if (cset == &init_css_set) { + res_cgroup = &root->cgrp; + } else if (root == &cgrp_dfl_root) { + res_cgroup = cset->dfl_cgrp; + } else { + struct cgrp_cset_link *link; + + list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { + struct cgroup *c = link->cgrp; + + if (c->root == root) { + res_cgroup = c; + break; + } + } + } + + return res_cgroup; +} + /* * look up cgroup associated with current task's cgroup namespace on the * specified hierarchy @@ -1391,22 +1431,8 @@ current_cgns_cgroup_from_root(struct cgroup_root *root) rcu_read_lock(); cset = current->nsproxy->cgroup_ns->root_cset; - if (cset == &init_css_set) { - res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; - } else { - struct cgrp_cset_link *link; + res = __cset_cgroup_from_root(cset, root); - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - if (c->root == root) { - res = c; - break; - } - } - } rcu_read_unlock(); BUG_ON(!res); @@ -1422,22 +1448,7 @@ static struct cgroup *cset_cgroup_from_root(struct css_set *cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); - if (cset == &init_css_set) { - res = &root->cgrp; - } else if (root == &cgrp_dfl_root) { - res = cset->dfl_cgrp; - } else { - struct cgrp_cset_link *link; - - list_for_each_entry(link, &cset->cgrp_links, cgrp_link) { - struct cgroup *c = link->cgrp; - - if (c->root == root) { - res = c; - break; - } - } - } + res = __cset_cgroup_from_root(cset, root); BUG_ON(!res); return res; @@ -1864,6 +1875,7 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node, enum cgroup2_param { Opt_nsdelegate, + Opt_favordynmods, Opt_memory_localevents, Opt_memory_recursiveprot, nr__cgroup2_params @@ -1871,6 +1883,7 @@ enum cgroup2_param { static const struct fs_parameter_spec cgroup2_fs_parameters[] = { fsparam_flag("nsdelegate", Opt_nsdelegate), + fsparam_flag("favordynmods", Opt_favordynmods), fsparam_flag("memory_localevents", Opt_memory_localevents), fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot), {} @@ -1890,6 +1903,9 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param case Opt_nsdelegate: ctx->flags |= CGRP_ROOT_NS_DELEGATE; return 0; + case Opt_favordynmods: + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; + return 0; case Opt_memory_localevents: ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; return 0; @@ -1908,6 +1924,9 @@ static void apply_cgroup_root_flags(unsigned int root_flags) else cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE; + cgroup_favor_dynmods(&cgrp_dfl_root, + root_flags & CGRP_ROOT_FAVOR_DYNMODS); + if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS; else @@ -1924,6 +1943,8 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root { if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE) seq_puts(seq, ",nsdelegate"); + if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS) + seq_puts(seq, ",favordynmods"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) seq_puts(seq, ",memory_localevents"); if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT) @@ -1974,7 +1995,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx) cgrp->root = root; init_cgroup_housekeeping(cgrp); - root->flags = ctx->flags; + /* DYNMODS must be modified through cgroup_favor_dynmods() */ + root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS; if (ctx->release_agent) strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX); if (ctx->name) @@ -2196,6 +2218,10 @@ static int cgroup_init_fs_context(struct fs_context *fc) put_user_ns(fc->user_ns); fc->user_ns = get_user_ns(ctx->ns->user_ns); fc->global = true; + +#ifdef CONFIG_CGROUP_FAVOR_DYNMODS + ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS; +#endif return 0; } @@ -2570,10 +2596,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp)) return -EOPNOTSUPP; - /* mixables don't care */ - if (cgroup_is_mixable(dst_cgrp)) - return 0; - /* * If @dst_cgrp is already or can become a thread root or is * threaded, it doesn't matter. @@ -2597,21 +2619,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp) */ void cgroup_migrate_finish(struct cgroup_mgctx *mgctx) { - LIST_HEAD(preloaded); struct css_set *cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); spin_lock_irq(&css_set_lock); - list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded); - list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded); + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets, + mg_src_preload_node) { + cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; + cset->mg_dst_cset = NULL; + list_del_init(&cset->mg_src_preload_node); + put_css_set_locked(cset); + } - list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) { + list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets, + mg_dst_preload_node) { cset->mg_src_cgrp = NULL; cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; - list_del_init(&cset->mg_preload_node); + list_del_init(&cset->mg_dst_preload_node); put_css_set_locked(cset); } @@ -2651,7 +2679,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - if (!list_empty(&src_cset->mg_preload_node)) + if (!list_empty(&src_cset->mg_src_preload_node)) return; src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); @@ -2664,7 +2692,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset, src_cset->mg_src_cgrp = src_cgrp; src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); - list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets); + list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets); } /** @@ -2689,7 +2717,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets, - mg_preload_node) { + mg_src_preload_node) { struct css_set *dst_cset; struct cgroup_subsys *ss; int ssid; @@ -2708,7 +2736,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; src_cset->mg_dst_cgrp = NULL; - list_del_init(&src_cset->mg_preload_node); + list_del_init(&src_cset->mg_src_preload_node); put_css_set(src_cset); put_css_set(dst_cset); continue; @@ -2716,8 +2744,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx) src_cset->mg_dst_cset = dst_cset; - if (list_empty(&dst_cset->mg_preload_node)) - list_add_tail(&dst_cset->mg_preload_node, + if (list_empty(&dst_cset->mg_dst_preload_node)) + list_add_tail(&dst_cset->mg_dst_preload_node, &mgctx->preloaded_dst_csets); else put_css_set(dst_cset); @@ -2941,29 +2969,48 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) struct cgroup_subsys_state *d_css; struct cgroup *dsct; struct css_set *src_cset; + bool has_tasks; int ret; lockdep_assert_held(&cgroup_mutex); - percpu_down_write(&cgroup_threadgroup_rwsem); - /* look up all csses currently attached to @cgrp's subtree */ spin_lock_irq(&css_set_lock); cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; + /* + * As cgroup_update_dfl_csses() is only called by + * cgroup_apply_control(). The csses associated with the + * given cgrp will not be affected by changes made to + * its subtree_control file. We can skip them. + */ + if (dsct == cgrp) + continue; + list_for_each_entry(link, &dsct->cset_links, cset_link) cgroup_migrate_add_src(link->cset, dsct, &mgctx); } spin_unlock_irq(&css_set_lock); + /* + * We need to write-lock threadgroup_rwsem while migrating tasks. + * However, if there are no source csets for @cgrp, changing its + * controllers isn't gonna produce any task migrations and the + * write-locking can be skipped safely. + */ + has_tasks = !list_empty(&mgctx.preloaded_src_csets); + if (has_tasks) + percpu_down_write(&cgroup_threadgroup_rwsem); + /* NULL dst indicates self on default hierarchy */ ret = cgroup_migrate_prepare_dst(&mgctx); if (ret) goto out_finish; spin_lock_irq(&css_set_lock); - list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) { + list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, + mg_src_preload_node) { struct task_struct *task, *ntask; /* all tasks in src_csets need to be migrated */ @@ -2975,7 +3022,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) ret = cgroup_migrate_execute(&mgctx); out_finish: cgroup_migrate_finish(&mgctx); - percpu_up_write(&cgroup_threadgroup_rwsem); + if (has_tasks) + percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } @@ -3609,21 +3657,21 @@ static int cpu_stat_show(struct seq_file *seq, void *v) static int cgroup_io_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_IO); } static int cgroup_memory_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_MEM); } static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; return psi_show(seq, psi, PSI_CPU); } @@ -3649,7 +3697,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return -EBUSY; } - psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi; + psi = cgroup_ino(cgrp) == 1 ? &psi_system : cgrp->psi; new = psi_trigger_create(psi, buf, nbytes, res); if (IS_ERR(new)) { cgroup_put(cgrp); @@ -5842,12 +5890,6 @@ int __init cgroup_init(void) cgroup_rstat_boot(); - /* - * The latency of the synchronize_rcu() is too high for cgroups, - * avoid it at the cost of forcing all readers into the slow path. - */ - rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss); - get_user_ns(init_cgroup_ns.user_ns); mutex_lock(&cgroup_mutex); @@ -6759,6 +6801,7 @@ static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr, { return snprintf(buf, PAGE_SIZE, "nsdelegate\n" + "favordynmods\n" "memory_localevents\n" "memory_recursiveprot\n"); } |