aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/cgroup/cgroup.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cgroup/cgroup.c')
-rw-r--r--kernel/cgroup/cgroup.c1586
1 files changed, 1119 insertions, 467 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 3dead0416b91..2319946715e0 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -30,6 +30,7 @@
#include "cgroup-internal.h"
+#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@@ -68,6 +69,14 @@
#define CGROUP_FILE_NOTIFY_MIN_INTV DIV_ROUND_UP(HZ, 100)
/*
+ * To avoid confusing the compiler (and generating warnings) with code
+ * that attempts to access what would be a 0-element array (i.e. sized
+ * to a potentially empty array when CGROUP_SUBSYS_COUNT == 0), this
+ * constant expression can be added.
+ */
+#define CGROUP_HAS_SUBSYS_CONFIG (CGROUP_SUBSYS_COUNT > 0)
+
+/*
* cgroup_mutex is the master lock. Any modification to cgroup or its
* hierarchy must be performed while holding it.
*
@@ -87,7 +96,7 @@ EXPORT_SYMBOL_GPL(css_set_lock);
DEFINE_SPINLOCK(trace_cgroup_path_lock);
char trace_cgroup_path[TRACE_CGROUP_PATH_LEN];
-bool cgroup_debug __read_mostly;
+static bool cgroup_debug __read_mostly;
/*
* Protects cgroup_idr and css_idr so that IDs can be released without
@@ -153,11 +162,7 @@ static struct static_key_true *cgroup_subsys_on_dfl_key[] = {
static DEFINE_PER_CPU(struct cgroup_rstat_cpu, cgrp_dfl_root_rstat_cpu);
-/*
- * The default hierarchy, reserved for the subsystems that are otherwise
- * unattached - it never has more than a single cgroup, and all tasks are
- * part of that cgroup.
- */
+/* the default hierarchy */
struct cgroup_root cgrp_dfl_root = { .cgrp.rstat_cpu = &cgrp_dfl_root_rstat_cpu };
EXPORT_SYMBOL_GPL(cgrp_dfl_root);
@@ -203,7 +208,7 @@ static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
struct cgroup_namespace init_cgroup_ns = {
- .count = REFCOUNT_INIT(2),
+ .ns.count = REFCOUNT_INIT(2),
.user_ns = &init_user_ns,
.ns.ops = &cgroupns_operations,
.ns.inum = PROC_CGROUP_INIT_INO,
@@ -212,6 +217,23 @@ struct cgroup_namespace init_cgroup_ns = {
static struct file_system_type cgroup2_fs_type;
static struct cftype cgroup_base_files[];
+static struct cftype cgroup_psi_files[];
+
+/* cgroup optional features */
+enum cgroup_opt_features {
+#ifdef CONFIG_PSI
+ OPT_FEATURE_PRESSURE,
+#endif
+ OPT_FEATURE_COUNT
+};
+
+static const char *cgroup_opt_feature_names[OPT_FEATURE_COUNT] = {
+#ifdef CONFIG_PSI
+ "pressure",
+#endif
+};
+
+static u16 cgroup_feature_disable_mask __read_mostly;
static int cgroup_apply_control(struct cgroup *cgrp);
static void cgroup_finalize_control(struct cgroup *cgrp, int ret);
@@ -236,7 +258,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css,
*/
bool cgroup_ssid_enabled(int ssid)
{
- if (CGROUP_SUBSYS_COUNT == 0)
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
return false;
return static_key_enabled(cgroup_subsys_enabled_key[ssid]);
@@ -248,12 +270,9 @@ bool cgroup_ssid_enabled(int ssid)
*
* The default hierarchy is the v2 interface of cgroup and this function
* can be used to test whether a cgroup is on the default hierarchy for
- * cases where a subsystem should behave differnetly depending on the
+ * cases where a subsystem should behave differently depending on the
* interface version.
*
- * The set of behaviors which change on the default hierarchy are still
- * being determined and the mount option is prefixed with __DEVEL__.
- *
* List of changed behaviors:
*
* - Mount options "noprefix", "xattr", "clone_children", "release_agent"
@@ -261,15 +280,13 @@ bool cgroup_ssid_enabled(int ssid)
*
* - When mounting an existing superblock, mount options should match.
*
- * - Remount is disallowed.
- *
* - rename(2) is disallowed.
*
* - "tasks" is removed. Everything should be at process granularity. Use
* "cgroup.procs" instead.
*
* - "cgroup.procs" is not sorted. pids will be unique unless they got
- * recycled inbetween reads.
+ * recycled in-between reads.
*
* - "release_agent" and "notify_on_release" are removed. Replacement
* notification mechanism will be implemented.
@@ -288,9 +305,6 @@ bool cgroup_ssid_enabled(int ssid)
* - cpuset: a task can be moved into an empty cpuset, and again it takes
* masks of ancestors.
*
- * - memcg: use_hierarchy is on by default and the cgroup file for the flag
- * is not created.
- *
* - blkcg: blk-throttle becomes properly hierarchical.
*
* - debug: disallowed on the default hierarchy.
@@ -352,7 +366,7 @@ static bool cgroup_is_mixable(struct cgroup *cgrp)
return !cgroup_parent(cgrp);
}
-/* can @cgrp become a thread root? should always be true for a thread root */
+/* can @cgrp become a thread root? Should always be true for a thread root */
static bool cgroup_can_be_thread_root(struct cgroup *cgrp)
{
/* mixables don't care */
@@ -466,7 +480,7 @@ static u16 cgroup_ss_mask(struct cgroup *cgrp)
static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
- if (ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && ss)
return rcu_dereference_check(cgrp->subsys[ss->id],
lockdep_is_held(&cgroup_mutex));
else
@@ -478,7 +492,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp,
* @cgrp: the cgroup of interest
* @ss: the subsystem of interest
*
- * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist
+ * Find and get @cgrp's css associated with @ss. If the css doesn't exist
* or is offline, %NULL is returned.
*/
static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp,
@@ -537,13 +551,16 @@ static struct cgroup_subsys_state *cgroup_e_css_by_mask(struct cgroup *cgrp,
* the root css is returned, so this function always returns a valid css.
*
* The returned css is not guaranteed to be online, and therefore it is the
- * callers responsiblity to tryget a reference for it.
+ * callers responsibility to try get a reference for it.
*/
struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
struct cgroup_subsys *ss)
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
do {
css = cgroup_css(cgrp, ss);
@@ -571,6 +588,9 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
{
struct cgroup_subsys_state *css;
+ if (!CGROUP_HAS_SUBSYS_CONFIG)
+ return NULL;
+
rcu_read_lock();
do {
@@ -587,6 +607,7 @@ out_unlock:
rcu_read_unlock();
return css;
}
+EXPORT_SYMBOL_GPL(cgroup_get_e_css);
static void cgroup_get_live(struct cgroup *cgrp)
{
@@ -640,7 +661,7 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of)
* the matching css from the cgroup's subsys table is guaranteed to
* be and stay valid until the enclosing operation is complete.
*/
- if (cft->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && cft->ss)
return rcu_dereference_raw(cgrp->subsys[cft->ss->id]);
else
return &cgrp->self;
@@ -688,7 +709,7 @@ EXPORT_SYMBOL_GPL(of_css);
*/
#define do_each_subsys_mask(ss, ssid, ss_mask) do { \
unsigned long __ss_mask = (ss_mask); \
- if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \
+ if (!CGROUP_HAS_SUBSYS_CONFIG) { \
(ssid) = 0; \
break; \
} \
@@ -709,7 +730,7 @@ EXPORT_SYMBOL_GPL(of_css);
; \
else
-/* walk live descendants in preorder */
+/* walk live descendants in pre order */
#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \
css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \
if (({ lockdep_assert_held(&cgroup_mutex); \
@@ -743,7 +764,8 @@ struct css_set init_css_set = {
.task_iters = LIST_HEAD_INIT(init_css_set.task_iters),
.threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
- .mg_preload_node = LIST_HEAD_INIT(init_css_set.mg_preload_node),
+ .mg_src_preload_node = LIST_HEAD_INIT(init_css_set.mg_src_preload_node),
+ .mg_dst_preload_node = LIST_HEAD_INIT(init_css_set.mg_dst_preload_node),
.mg_node = LIST_HEAD_INIT(init_css_set.mg_node),
/*
@@ -943,7 +965,7 @@ void put_css_set_locked(struct css_set *cset)
WARN_ON_ONCE(!list_empty(&cset->threaded_csets));
- /* This css_set is dead. unlink it and release cgroup and css refs */
+ /* This css_set is dead. Unlink it and release cgroup and css refs */
for_each_subsys(ss, ssid) {
list_del(&cset->e_cset_node[ssid]);
css_put(cset->subsys[ssid]);
@@ -1068,7 +1090,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset,
/*
* Build the set of subsystem state objects that we want to see in the
- * new css_set. while subsystems can change globally, the entries here
+ * new css_set. While subsystems can change globally, the entries here
* won't change, so no need for locking.
*/
for_each_subsys(ss, i) {
@@ -1158,7 +1180,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset,
/*
* Always add links to the tail of the lists so that the lists are
- * in choronological order.
+ * in chronological order.
*/
list_move_tail(&link->cset_link, &cgrp->cset_links);
list_add_tail(&link->cgrp_link, &cset->cgrp_links);
@@ -1218,7 +1240,8 @@ static struct css_set *find_css_set(struct css_set *old_cset,
INIT_LIST_HEAD(&cset->threaded_csets);
INIT_HLIST_NODE(&cset->hlist);
INIT_LIST_HEAD(&cset->cgrp_links);
- INIT_LIST_HEAD(&cset->mg_preload_node);
+ INIT_LIST_HEAD(&cset->mg_src_preload_node);
+ INIT_LIST_HEAD(&cset->mg_dst_preload_node);
INIT_LIST_HEAD(&cset->mg_node);
/* Copy the set of subsystem state objects generated in
@@ -1280,11 +1303,25 @@ static struct css_set *find_css_set(struct css_set *old_cset,
struct cgroup_root *cgroup_root_from_kf(struct kernfs_root *kf_root)
{
- struct cgroup *root_cgrp = kf_root->kn->priv;
+ struct cgroup *root_cgrp = kernfs_root_to_node(kf_root)->priv;
return root_cgrp->root;
}
+void cgroup_favor_dynmods(struct cgroup_root *root, bool favor)
+{
+ bool favoring = root->flags & CGRP_ROOT_FAVOR_DYNMODS;
+
+ /* see the comment above CGRP_ROOT_FAVOR_DYNMODS definition */
+ if (favor && !favoring) {
+ rcu_sync_enter(&cgroup_threadgroup_rwsem.rss);
+ root->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ } else if (!favor && favoring) {
+ rcu_sync_exit(&cgroup_threadgroup_rwsem.rss);
+ root->flags &= ~CGRP_ROOT_FAVOR_DYNMODS;
+ }
+}
+
static int cgroup_init_root_id(struct cgroup_root *root)
{
int id;
@@ -1345,82 +1382,98 @@ static void cgroup_destroy_root(struct cgroup_root *root)
cgroup_root_count--;
}
+ cgroup_favor_dynmods(root, false);
cgroup_exit_root_id(root);
mutex_unlock(&cgroup_mutex);
+ cgroup_rstat_exit(cgrp);
kernfs_destroy_root(root->kf_root);
cgroup_free_root(root);
}
/*
- * look up cgroup associated with current task's cgroup namespace on the
- * specified hierarchy
+ * Returned cgroup is without refcount but it's valid as long as cset pins it.
*/
-static struct cgroup *
-current_cgns_cgroup_from_root(struct cgroup_root *root)
+static inline struct cgroup *__cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
{
- struct cgroup *res = NULL;
- struct css_set *cset;
-
- lockdep_assert_held(&css_set_lock);
-
- rcu_read_lock();
+ struct cgroup *res_cgroup = NULL;
- cset = current->nsproxy->cgroup_ns->root_cset;
if (cset == &init_css_set) {
- res = &root->cgrp;
+ res_cgroup = &root->cgrp;
} else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
+ res_cgroup = cset->dfl_cgrp;
} else {
struct cgrp_cset_link *link;
+ lockdep_assert_held(&css_set_lock);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
if (c->root == root) {
- res = c;
+ res_cgroup = c;
break;
}
}
}
- rcu_read_unlock();
- BUG_ON(!res);
- return res;
+ BUG_ON(!res_cgroup);
+ return res_cgroup;
}
-/* look up cgroup associated with given css_set on the specified hierarchy */
-static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
- struct cgroup_root *root)
+/*
+ * look up cgroup associated with current task's cgroup namespace on the
+ * specified hierarchy
+ */
+static struct cgroup *
+current_cgns_cgroup_from_root(struct cgroup_root *root)
{
struct cgroup *res = NULL;
+ struct css_set *cset;
- lockdep_assert_held(&cgroup_mutex);
lockdep_assert_held(&css_set_lock);
- if (cset == &init_css_set) {
- res = &root->cgrp;
- } else if (root == &cgrp_dfl_root) {
- res = cset->dfl_cgrp;
- } else {
- struct cgrp_cset_link *link;
+ rcu_read_lock();
- list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
- struct cgroup *c = link->cgrp;
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ res = __cset_cgroup_from_root(cset, root);
- if (c->root == root) {
- res = c;
- break;
- }
- }
- }
+ rcu_read_unlock();
- BUG_ON(!res);
return res;
}
/*
+ * Look up cgroup associated with current task's cgroup namespace on the default
+ * hierarchy.
+ *
+ * Unlike current_cgns_cgroup_from_root(), this doesn't need locks:
+ * - Internal rcu_read_lock is unnecessary because we don't dereference any rcu
+ * pointers.
+ * - css_set_lock is not needed because we just read cset->dfl_cgrp.
+ * - As a bonus returned cgrp is pinned with the current because it cannot
+ * switch cgroup_ns asynchronously.
+ */
+static struct cgroup *current_cgns_cgroup_dfl(void)
+{
+ struct css_set *cset;
+
+ cset = current->nsproxy->cgroup_ns->root_cset;
+ return __cset_cgroup_from_root(cset, &cgrp_dfl_root);
+}
+
+/* look up cgroup associated with given css_set on the specified hierarchy */
+static struct cgroup *cset_cgroup_from_root(struct css_set *cset,
+ struct cgroup_root *root)
+{
+ lockdep_assert_held(&cgroup_mutex);
+ lockdep_assert_held(&css_set_lock);
+
+ return __cset_cgroup_from_root(cset, root);
+}
+
+/*
* Return the cgroup for "task" from the given hierarchy. Must be
* called with cgroup_mutex and css_set_lock held.
*/
@@ -1642,7 +1695,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
/**
* css_clear_dir - remove subsys files in a cgroup directory
- * @css: taget css
+ * @css: target css
*/
static void css_clear_dir(struct cgroup_subsys_state *css)
{
@@ -1655,12 +1708,16 @@ static void css_clear_dir(struct cgroup_subsys_state *css)
css->flags &= ~CSS_VISIBLE;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- cgroup_addrm_files(css, cgrp, cfts, false);
+ if (cgroup_on_dfl(cgrp)) {
+ cgroup_addrm_files(css, cgrp,
+ cgroup_base_files, false);
+ if (cgroup_psi_enabled())
+ cgroup_addrm_files(css, cgrp,
+ cgroup_psi_files, false);
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, false);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node)
cgroup_addrm_files(css, cgrp, cfts, false);
@@ -1683,14 +1740,22 @@ static int css_populate_dir(struct cgroup_subsys_state *css)
return 0;
if (!css->ss) {
- if (cgroup_on_dfl(cgrp))
- cfts = cgroup_base_files;
- else
- cfts = cgroup1_base_files;
-
- ret = cgroup_addrm_files(&cgrp->self, cgrp, cfts, true);
- if (ret < 0)
- return ret;
+ if (cgroup_on_dfl(cgrp)) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_base_files, true);
+ if (ret < 0)
+ return ret;
+
+ if (cgroup_psi_enabled()) {
+ ret = cgroup_addrm_files(&cgrp->self, cgrp,
+ cgroup_psi_files, true);
+ if (ret < 0)
+ return ret;
+ }
+ } else {
+ cgroup_addrm_files(css, cgrp,
+ cgroup1_base_files, true);
+ }
} else {
list_for_each_entry(cfts, &css->ss->cfts, node) {
ret = cgroup_addrm_files(css, cgrp, cfts, true);
@@ -1718,6 +1783,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
struct cgroup *dcgrp = &dst_root->cgrp;
struct cgroup_subsys *ss;
int ssid, i, ret;
+ u16 dfl_disable_ss_mask = 0;
lockdep_assert_held(&cgroup_mutex);
@@ -1734,8 +1800,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
/* can't move between two non-dummy roots either */
if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
return -EBUSY;
+
+ /*
+ * Collect ssid's that need to be disabled from default
+ * hierarchy.
+ */
+ if (ss->root == &cgrp_dfl_root)
+ dfl_disable_ss_mask |= 1 << ssid;
+
} while_each_subsys_mask();
+ if (dfl_disable_ss_mask) {
+ struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
+
+ /*
+ * Controllers from default hierarchy that need to be rebound
+ * are all disabled together in one go.
+ */
+ cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
+
do_each_subsys_mask(ss, ssid, ss_mask) {
struct cgroup_root *src_root = ss->root;
struct cgroup *scgrp = &src_root->cgrp;
@@ -1744,10 +1830,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
WARN_ON(!css || cgroup_css(dcgrp, ss));
- /* disable from the source */
- src_root->subsys_mask &= ~(1 << ssid);
- WARN_ON(cgroup_apply_control(scgrp));
- cgroup_finalize_control(scgrp, 0);
+ if (src_root != &cgrp_dfl_root) {
+ /* disable from the source */
+ src_root->subsys_mask &= ~(1 << ssid);
+ WARN_ON(cgroup_apply_control(scgrp));
+ cgroup_finalize_control(scgrp, 0);
+ }
/* rebind */
RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
@@ -1761,6 +1849,13 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
&dcgrp->e_csets[ss->id]);
spin_unlock_irq(&css_set_lock);
+ if (ss->css_rstat_flush) {
+ list_del_rcu(&css->rstat_css_node);
+ synchronize_rcu();
+ list_add_rcu(&css->rstat_css_node,
+ &dcgrp->rstat_css_list);
+ }
+
/* default hierarchy doesn't enable controllers by default */
dst_root->subsys_mask |= 1 << ssid;
if (dst_root == &cgrp_dfl_root) {
@@ -1812,13 +1907,17 @@ int cgroup_show_path(struct seq_file *sf, struct kernfs_node *kf_node,
enum cgroup2_param {
Opt_nsdelegate,
+ Opt_favordynmods,
Opt_memory_localevents,
+ Opt_memory_recursiveprot,
nr__cgroup2_params
};
static const struct fs_parameter_spec cgroup2_fs_parameters[] = {
fsparam_flag("nsdelegate", Opt_nsdelegate),
+ fsparam_flag("favordynmods", Opt_favordynmods),
fsparam_flag("memory_localevents", Opt_memory_localevents),
+ fsparam_flag("memory_recursiveprot", Opt_memory_recursiveprot),
{}
};
@@ -1836,9 +1935,15 @@ static int cgroup2_parse_param(struct fs_context *fc, struct fs_parameter *param
case Opt_nsdelegate:
ctx->flags |= CGRP_ROOT_NS_DELEGATE;
return 0;
+ case Opt_favordynmods:
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+ return 0;
case Opt_memory_localevents:
ctx->flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
return 0;
+ case Opt_memory_recursiveprot:
+ ctx->flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ return 0;
}
return -EINVAL;
}
@@ -1851,10 +1956,18 @@ static void apply_cgroup_root_flags(unsigned int root_flags)
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_NS_DELEGATE;
+ cgroup_favor_dynmods(&cgrp_dfl_root,
+ root_flags & CGRP_ROOT_FAVOR_DYNMODS);
+
if (root_flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_LOCAL_EVENTS;
else
cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_LOCAL_EVENTS;
+
+ if (root_flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
+ cgrp_dfl_root.flags |= CGRP_ROOT_MEMORY_RECURSIVE_PROT;
+ else
+ cgrp_dfl_root.flags &= ~CGRP_ROOT_MEMORY_RECURSIVE_PROT;
}
}
@@ -1862,8 +1975,12 @@ static int cgroup_show_options(struct seq_file *seq, struct kernfs_root *kf_root
{
if (cgrp_dfl_root.flags & CGRP_ROOT_NS_DELEGATE)
seq_puts(seq, ",nsdelegate");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_FAVOR_DYNMODS)
+ seq_puts(seq, ",favordynmods");
if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS)
seq_puts(seq, ",memory_localevents");
+ if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT)
+ seq_puts(seq, ",memory_recursiveprot");
return 0;
}
@@ -1910,7 +2027,8 @@ void init_cgroup_root(struct cgroup_fs_context *ctx)
cgrp->root = root;
init_cgroup_housekeeping(cgrp);
- root->flags = ctx->flags;
+ /* DYNMODS must be modified through cgroup_favor_dynmods() */
+ root->flags = ctx->flags & ~CGRP_ROOT_FAVOR_DYNMODS;
if (ctx->release_agent)
strscpy(root->release_agent_path, ctx->release_agent, PATH_MAX);
if (ctx->name)
@@ -1954,24 +2072,29 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
root->kf_root = kernfs_create_root(kf_sops,
KERNFS_ROOT_CREATE_DEACTIVATED |
- KERNFS_ROOT_SUPPORT_EXPORTOP,
+ KERNFS_ROOT_SUPPORT_EXPORTOP |
+ KERNFS_ROOT_SUPPORT_USER_XATTR,
root_cgrp);
if (IS_ERR(root->kf_root)) {
ret = PTR_ERR(root->kf_root);
goto exit_root_id;
}
- root_cgrp->kn = root->kf_root->kn;
+ root_cgrp->kn = kernfs_root_to_node(root->kf_root);
WARN_ON_ONCE(cgroup_ino(root_cgrp) != 1);
- root_cgrp->ancestor_ids[0] = cgroup_id(root_cgrp);
+ root_cgrp->ancestors[0] = root_cgrp;
ret = css_populate_dir(&root_cgrp->self);
if (ret)
goto destroy_root;
- ret = rebind_subsystems(root, ss_mask);
+ ret = cgroup_rstat_init(root_cgrp);
if (ret)
goto destroy_root;
+ ret = rebind_subsystems(root, ss_mask);
+ if (ret)
+ goto exit_stats;
+
ret = cgroup_bpf_inherit(root_cgrp);
WARN_ON_ONCE(ret);
@@ -2000,10 +2123,11 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
- kernfs_activate(root_cgrp->kn);
ret = 0;
goto out;
+exit_stats:
+ cgroup_rstat_exit(root_cgrp);
destroy_root:
kernfs_destroy_root(root->kf_root);
root->kf_root = NULL;
@@ -2080,7 +2204,7 @@ static int cgroup_get_tree(struct fs_context *fc)
struct cgroup_fs_context *ctx = cgroup_fc2context(fc);
int ret;
- cgrp_dfl_visible = true;
+ WRITE_ONCE(cgrp_dfl_visible, true);
cgroup_get_live(&cgrp_dfl_root.cgrp);
ctx->root = &cgrp_dfl_root;
@@ -2126,6 +2250,10 @@ static int cgroup_init_fs_context(struct fs_context *fc)
put_user_ns(fc->user_ns);
fc->user_ns = get_user_ns(ctx->ns->user_ns);
fc->global = true;
+
+#ifdef CONFIG_CGROUP_FAVOR_DYNMODS
+ ctx->flags |= CGRP_ROOT_FAVOR_DYNMODS;
+#endif
return 0;
}
@@ -2137,13 +2265,14 @@ static void cgroup_kill_sb(struct super_block *sb)
/*
* If @root doesn't have any children, start killing it.
* This prevents new mounts by disabling percpu_ref_tryget_live().
- * cgroup_mount() may wait for @root's release.
*
* And don't kill the default root.
*/
if (list_empty(&root->cgrp.self.children) && root != &cgrp_dfl_root &&
- !percpu_ref_is_dying(&root->cgrp.self.refcnt))
+ !percpu_ref_is_dying(&root->cgrp.self.refcnt)) {
+ cgroup_bpf_offline(&root->cgrp);
percpu_ref_kill(&root->cgrp.self.refcnt);
+ }
cgroup_put(&root->cgrp);
kernfs_kill_sb(sb);
}
@@ -2263,7 +2392,7 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
ret = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns);
} else {
/* if no hierarchy exists, everyone is in "/" */
- ret = strlcpy(buf, "/", buflen);
+ ret = strscpy(buf, "/", buflen);
}
spin_unlock_irq(&css_set_lock);
@@ -2273,6 +2402,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
EXPORT_SYMBOL_GPL(task_cgroup_path);
/**
+ * cgroup_attach_lock - Lock for ->attach()
+ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
+ *
+ * cgroup migration sometimes needs to stabilize threadgroups against forks and
+ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
+ * implementations (e.g. cpuset), also need to disable CPU hotplug.
+ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
+ * lead to deadlocks.
+ *
+ * Bringing up a CPU may involve creating and destroying tasks which requires
+ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
+ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
+ * waiting for an on-going CPU hotplug operation which in turn is waiting for
+ * the threadgroup_rwsem to be released to create new tasks. For more details:
+ *
+ * http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
+ *
+ * Resolve the situation by always acquiring cpus_read_lock() before optionally
+ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
+ * CPU hotplug is disabled on entry.
+ */
+void cgroup_attach_lock(bool lock_threadgroup)
+{
+ cpus_read_lock();
+ if (lock_threadgroup)
+ percpu_down_write(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_attach_unlock - Undo cgroup_attach_lock()
+ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
+ */
+void cgroup_attach_unlock(bool lock_threadgroup)
+{
+ if (lock_threadgroup)
+ percpu_up_write(&cgroup_threadgroup_rwsem);
+ cpus_read_unlock();
+}
+
+/**
* cgroup_migrate_add_task - add a migration target task to a migration context
* @task: target task
* @mgctx: target migration context
@@ -2341,7 +2511,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
struct css_set *cset = tset->cur_cset;
struct task_struct *task = tset->cur_task;
- while (&cset->mg_node != tset->csets) {
+ while (CGROUP_HAS_SUBSYS_CONFIG && &cset->mg_node != tset->csets) {
if (!task)
task = list_first_entry(&cset->mg_tasks,
struct task_struct, cg_list);
@@ -2374,7 +2544,7 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
}
/**
- * cgroup_taskset_migrate - migrate a taskset
+ * cgroup_migrate_execute - migrate a taskset
* @mgctx: migration context
*
* Migrate tasks in @mgctx as setup by migration preparation functions.
@@ -2499,10 +2669,6 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
if (!cgroup_is_valid_domain(dst_cgrp->dom_cgrp))
return -EOPNOTSUPP;
- /* mixables don't care */
- if (cgroup_is_mixable(dst_cgrp))
- return 0;
-
/*
* If @dst_cgrp is already or can become a thread root or is
* threaded, it doesn't matter.
@@ -2526,21 +2692,27 @@ int cgroup_migrate_vet_dst(struct cgroup *dst_cgrp)
*/
void cgroup_migrate_finish(struct cgroup_mgctx *mgctx)
{
- LIST_HEAD(preloaded);
struct css_set *cset, *tmp_cset;
lockdep_assert_held(&cgroup_mutex);
spin_lock_irq(&css_set_lock);
- list_splice_tail_init(&mgctx->preloaded_src_csets, &preloaded);
- list_splice_tail_init(&mgctx->preloaded_dst_csets, &preloaded);
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_src_csets,
+ mg_src_preload_node) {
+ cset->mg_src_cgrp = NULL;
+ cset->mg_dst_cgrp = NULL;
+ cset->mg_dst_cset = NULL;
+ list_del_init(&cset->mg_src_preload_node);
+ put_css_set_locked(cset);
+ }
- list_for_each_entry_safe(cset, tmp_cset, &preloaded, mg_preload_node) {
+ list_for_each_entry_safe(cset, tmp_cset, &mgctx->preloaded_dst_csets,
+ mg_dst_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cgrp = NULL;
cset->mg_dst_cset = NULL;
- list_del_init(&cset->mg_preload_node);
+ list_del_init(&cset->mg_dst_preload_node);
put_css_set_locked(cset);
}
@@ -2580,11 +2752,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
- src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
-
- if (!list_empty(&src_cset->mg_preload_node))
+ if (!list_empty(&src_cset->mg_src_preload_node))
return;
+ src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
+
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
@@ -2593,7 +2765,7 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
src_cset->mg_src_cgrp = src_cgrp;
src_cset->mg_dst_cgrp = dst_cgrp;
get_css_set(src_cset);
- list_add_tail(&src_cset->mg_preload_node, &mgctx->preloaded_src_csets);
+ list_add_tail(&src_cset->mg_src_preload_node, &mgctx->preloaded_src_csets);
}
/**
@@ -2618,7 +2790,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
/* look up the dst cset for each src cset and link it to src */
list_for_each_entry_safe(src_cset, tmp_cset, &mgctx->preloaded_src_csets,
- mg_preload_node) {
+ mg_src_preload_node) {
struct css_set *dst_cset;
struct cgroup_subsys *ss;
int ssid;
@@ -2637,7 +2809,7 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
src_cset->mg_dst_cgrp = NULL;
- list_del_init(&src_cset->mg_preload_node);
+ list_del_init(&src_cset->mg_src_preload_node);
put_css_set(src_cset);
put_css_set(dst_cset);
continue;
@@ -2645,8 +2817,8 @@ int cgroup_migrate_prepare_dst(struct cgroup_mgctx *mgctx)
src_cset->mg_dst_cset = dst_cset;
- if (list_empty(&dst_cset->mg_preload_node))
- list_add_tail(&dst_cset->mg_preload_node,
+ if (list_empty(&dst_cset->mg_dst_preload_node))
+ list_add_tail(&dst_cset->mg_dst_preload_node,
&mgctx->preloaded_dst_csets);
else
put_css_set(dst_cset);
@@ -2714,11 +2886,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
{
DEFINE_CGROUP_MGCTX(mgctx);
struct task_struct *task;
- int ret;
-
- ret = cgroup_migrate_vet_dst(dst_cgrp);
- if (ret)
- return ret;
+ int ret = 0;
/* look up all src csets */
spin_lock_irq(&css_set_lock);
@@ -2746,8 +2914,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
}
struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
- bool *locked)
- __acquires(&cgroup_threadgroup_rwsem)
+ bool *threadgroup_locked)
{
struct task_struct *tsk;
pid_t pid;
@@ -2764,12 +2931,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
* Therefore, we can skip the global lock.
*/
lockdep_assert_held(&cgroup_mutex);
- if (pid || threadgroup) {
- percpu_down_write(&cgroup_threadgroup_rwsem);
- *locked = true;
- } else {
- *locked = false;
- }
+ *threadgroup_locked = pid || threadgroup;
+ cgroup_attach_lock(*threadgroup_locked);
rcu_read_lock();
if (pid) {
@@ -2800,17 +2963,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
goto out_unlock_rcu;
out_unlock_threadgroup:
- if (*locked) {
- percpu_up_write(&cgroup_threadgroup_rwsem);
- *locked = false;
- }
+ cgroup_attach_unlock(*threadgroup_locked);
+ *threadgroup_locked = false;
out_unlock_rcu:
rcu_read_unlock();
return tsk;
}
-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
- __releases(&cgroup_threadgroup_rwsem)
+void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
{
struct cgroup_subsys *ss;
int ssid;
@@ -2818,8 +2978,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
/* release reference from cgroup_procs_write_start() */
put_task_struct(task);
- if (locked)
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(threadgroup_locked);
+
for_each_subsys(ss, ssid)
if (ss->post_attach)
ss->post_attach();
@@ -2874,29 +3034,47 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
struct cgroup_subsys_state *d_css;
struct cgroup *dsct;
struct css_set *src_cset;
+ bool has_tasks;
int ret;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cgroup_threadgroup_rwsem);
-
/* look up all csses currently attached to @cgrp's subtree */
spin_lock_irq(&css_set_lock);
cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
struct cgrp_cset_link *link;
+ /*
+ * As cgroup_update_dfl_csses() is only called by
+ * cgroup_apply_control(). The csses associated with the
+ * given cgrp will not be affected by changes made to
+ * its subtree_control file. We can skip them.
+ */
+ if (dsct == cgrp)
+ continue;
+
list_for_each_entry(link, &dsct->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, dsct, &mgctx);
}
spin_unlock_irq(&css_set_lock);
+ /*
+ * We need to write-lock threadgroup_rwsem while migrating tasks.
+ * However, if there are no source csets for @cgrp, changing its
+ * controllers isn't gonna produce any task migrations and the
+ * write-locking can be skipped safely.
+ */
+ has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+ cgroup_attach_lock(has_tasks);
+
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(&mgctx);
if (ret)
goto out_finish;
spin_lock_irq(&css_set_lock);
- list_for_each_entry(src_cset, &mgctx.preloaded_src_csets, mg_preload_node) {
+ list_for_each_entry(src_cset, &mgctx.preloaded_src_csets,
+ mg_src_preload_node) {
struct task_struct *task, *ntask;
/* all tasks in src_csets need to be migrated */
@@ -2908,7 +3086,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
ret = cgroup_migrate_execute(&mgctx);
out_finish:
cgroup_migrate_finish(&mgctx);
- percpu_up_write(&cgroup_threadgroup_rwsem);
+ cgroup_attach_unlock(has_tasks);
return ret;
}
@@ -3145,11 +3323,7 @@ static int cgroup_apply_control(struct cgroup *cgrp)
* making the following cgroup_update_dfl_csses() properly update
* css associations of all tasks in the subtree.
*/
- ret = cgroup_update_dfl_csses(cgrp);
- if (ret)
- return ret;
-
- return 0;
+ return cgroup_update_dfl_csses(cgrp);
}
/**
@@ -3542,30 +3716,32 @@ static int cpu_stat_show(struct seq_file *seq, void *v)
static int cgroup_io_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_IO);
}
static int cgroup_memory_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_MEM);
}
static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
{
struct cgroup *cgrp = seq_css(seq)->cgroup;
- struct psi_group *psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
+ struct psi_group *psi = cgroup_psi(cgrp);
return psi_show(seq, psi, PSI_CPU);
}
-static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
- size_t nbytes, enum psi_res res)
+static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
+ struct psi_group *psi;
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
@@ -3574,14 +3750,20 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
- new = psi_trigger_create(&cgrp->psi, buf, nbytes, res);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
+ psi = cgroup_psi(cgrp);
+ new = psi_trigger_create(psi, buf, res);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3591,33 +3773,117 @@ static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
+ return pressure_write(of, buf, nbytes, PSI_IO);
}
static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
+ return pressure_write(of, buf, nbytes, PSI_MEM);
}
static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
char *buf, size_t nbytes,
loff_t off)
{
- return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
+ return pressure_write(of, buf, nbytes, PSI_CPU);
+}
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+static int cgroup_irq_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ return psi_show(seq, psi, PSI_IRQ);
+}
+
+static ssize_t cgroup_irq_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ return pressure_write(of, buf, nbytes, PSI_IRQ);
+}
+#endif
+
+static int cgroup_pressure_show(struct seq_file *seq, void *v)
+{
+ struct cgroup *cgrp = seq_css(seq)->cgroup;
+ struct psi_group *psi = cgroup_psi(cgrp);
+
+ seq_printf(seq, "%d\n", psi->enabled);
+
+ return 0;
+}
+
+static ssize_t cgroup_pressure_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ ssize_t ret;
+ int enable;
+ struct cgroup *cgrp;
+ struct psi_group *psi;
+
+ ret = kstrtoint(strstrip(buf), 0, &enable);
+ if (ret)
+ return ret;
+
+ if (enable < 0 || enable > 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ psi = cgroup_psi(cgrp);
+ if (psi->enabled != enable) {
+ int i;
+
+ /* show or hide {cpu,memory,io,irq}.pressure files */
+ for (i = 0; i < NR_PSI_RESOURCES; i++)
+ cgroup_file_show(&cgrp->psi_files[i], enable);
+
+ psi->enabled = enable;
+ if (enable)
+ psi_cgroup_restart(psi);
+ }
+
+ cgroup_kn_unlock(of->kn);
+
+ return nbytes;
}
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_destroy(ctx->psi.trigger);
}
+
+bool cgroup_psi_enabled(void)
+{
+ if (static_branch_likely(&psi_disabled))
+ return false;
+
+ return (cgroup_feature_disable_mask & (1 << OPT_FEATURE_PRESSURE)) == 0;
+}
+
+#else /* CONFIG_PSI */
+bool cgroup_psi_enabled(void)
+{
+ return false;
+}
+
#endif /* CONFIG_PSI */
static int cgroup_freeze_show(struct seq_file *seq, void *v)
@@ -3654,32 +3920,128 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
return nbytes;
}
+static void __cgroup_kill(struct cgroup *cgrp)
+{
+ struct css_task_iter it;
+ struct task_struct *task;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ spin_lock_irq(&css_set_lock);
+ set_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+
+ css_task_iter_start(&cgrp->self, CSS_TASK_ITER_PROCS | CSS_TASK_ITER_THREADED, &it);
+ while ((task = css_task_iter_next(&it))) {
+ /* Ignore kernel threads here. */
+ if (task->flags & PF_KTHREAD)
+ continue;
+
+ /* Skip tasks that are already dying. */
+ if (__fatal_signal_pending(task))
+ continue;
+
+ send_sig(SIGKILL, task, 0);
+ }
+ css_task_iter_end(&it);
+
+ spin_lock_irq(&css_set_lock);
+ clear_bit(CGRP_KILL, &cgrp->flags);
+ spin_unlock_irq(&css_set_lock);
+}
+
+static void cgroup_kill(struct cgroup *cgrp)
+{
+ struct cgroup_subsys_state *css;
+ struct cgroup *dsct;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ cgroup_for_each_live_descendant_pre(dsct, css, cgrp)
+ __cgroup_kill(dsct);
+}
+
+static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ ssize_t ret = 0;
+ int kill;
+ struct cgroup *cgrp;
+
+ ret = kstrtoint(strstrip(buf), 0, &kill);
+ if (ret)
+ return ret;
+
+ if (kill != 1)
+ return -ERANGE;
+
+ cgrp = cgroup_kn_lock_live(of->kn, false);
+ if (!cgrp)
+ return -ENOENT;
+
+ /*
+ * Killing is a process directed operation, i.e. the whole thread-group
+ * is taken down so act like we do for cgroup.procs and only make this
+ * writable in non-threaded cgroups.
+ */
+ if (cgroup_is_threaded(cgrp))
+ ret = -EOPNOTSUPP;
+ else
+ cgroup_kill(cgrp);
+
+ cgroup_kn_unlock(of->kn);
+
+ return ret ?: nbytes;
+}
+
static int cgroup_file_open(struct kernfs_open_file *of)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
struct cgroup_subsys_state *css;
int ret;
+ if (!nbytes)
+ return 0;
+
/*
* If namespaces are delegation boundaries, disallow writes to
* files in an non-init namespace root from inside the namespace
@@ -3688,7 +4050,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -3723,7 +4085,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
static __poll_t cgroup_file_poll(struct kernfs_open_file *of, poll_table *pt)
{
- struct cftype *cft = of->kn->priv;
+ struct cftype *cft = of_cft(of);
if (cft->poll)
return cft->poll(of, pt);
@@ -3929,19 +4291,26 @@ static void cgroup_exit_cftypes(struct cftype *cfts)
cft->ss = NULL;
/* revert flags set by cgroup core while adding @cfts */
- cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL);
+ cft->flags &= ~(__CFTYPE_ONLY_ON_DFL | __CFTYPE_NOT_ON_DFL |
+ __CFTYPE_ADDED);
}
}
static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
{
struct cftype *cft;
+ int ret = 0;
for (cft = cfts; cft->name[0] != '\0'; cft++) {
struct kernfs_ops *kf_ops;
WARN_ON(cft->ss || cft->kf_ops);
+ if (cft->flags & __CFTYPE_ADDED) {
+ ret = -EBUSY;
+ break;
+ }
+
if (cft->seq_start)
kf_ops = &cgroup_kf_ops;
else
@@ -3954,26 +4323,26 @@ static int cgroup_init_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
if (cft->max_write_len && cft->max_write_len != PAGE_SIZE) {
kf_ops = kmemdup(kf_ops, sizeof(*kf_ops), GFP_KERNEL);
if (!kf_ops) {
- cgroup_exit_cftypes(cfts);
- return -ENOMEM;
+ ret = -ENOMEM;
+ break;
}
kf_ops->atomic_write_len = cft->max_write_len;
}
cft->kf_ops = kf_ops;
cft->ss = ss;
+ cft->flags |= __CFTYPE_ADDED;
}
- return 0;
+ if (ret)
+ cgroup_exit_cftypes(cfts);
+ return ret;
}
static int cgroup_rm_cftypes_locked(struct cftype *cfts)
{
lockdep_assert_held(&cgroup_mutex);
- if (!cfts || !cfts[0].ss)
- return -ENOENT;
-
list_del(&cfts->node);
cgroup_apply_cftypes(cfts, false);
cgroup_exit_cftypes(cfts);
@@ -3995,6 +4364,12 @@ int cgroup_rm_cftypes(struct cftype *cfts)
{
int ret;
+ if (!cfts || cfts[0].name[0] == '\0')
+ return 0;
+
+ if (!(cfts[0].flags & __CFTYPE_ADDED))
+ return -ENOENT;
+
mutex_lock(&cgroup_mutex);
ret = cgroup_rm_cftypes_locked(cfts);
mutex_unlock(&cgroup_mutex);
@@ -4100,6 +4475,26 @@ void cgroup_file_notify(struct cgroup_file *cfile)
}
/**
+ * cgroup_file_show - show or hide a hidden cgroup file
+ * @cfile: target cgroup_file obtained by setting cftype->file_offset
+ * @show: whether to show or hide
+ */
+void cgroup_file_show(struct cgroup_file *cfile, bool show)
+{
+ struct kernfs_node *kn;
+
+ spin_lock_irq(&cgroup_file_kn_lock);
+ kn = cfile->kn;
+ kernfs_get(kn);
+ spin_unlock_irq(&cgroup_file_kn_lock);
+
+ if (kn)
+ kernfs_show(kn, show);
+
+ kernfs_put(kn);
+}
+
+/**
* css_next_child - find the next child of a given css
* @pos: the current position (%NULL to initiate traversal)
* @parent: css whose children to walk
@@ -4133,7 +4528,7 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
* implies that if we observe !CSS_RELEASED on @pos in this RCU
* critical section, the one pointed to by its next pointer is
* guaranteed to not have finished its RCU grace period even if we
- * have dropped rcu_read_lock() inbetween iterations.
+ * have dropped rcu_read_lock() in-between iterations.
*
* If @pos has CSS_RELEASED set, its next pointer can't be
* dereferenced; however, as each css is given a monotonically
@@ -4148,7 +4543,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
} else if (likely(!(pos->flags & CSS_RELEASED))) {
next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
} else {
- list_for_each_entry_rcu(next, &parent->children, sibling)
+ list_for_each_entry_rcu(next, &parent->children, sibling,
+ lockdep_is_held(&cgroup_mutex))
if (next->serial_nr > pos->serial_nr)
break;
}
@@ -4380,7 +4776,7 @@ static struct css_set *css_task_iter_next_css_set(struct css_task_iter *it)
}
/**
- * css_task_iter_advance_css_set - advance a task itererator to the next css_set
+ * css_task_iter_advance_css_set - advance a task iterator to the next css_set
* @it: the iterator to advance
*
* Advance @it to the next css_set to walk.
@@ -4391,29 +4787,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
lockdep_assert_held(&css_set_lock);
- /* Advance to the next non-empty css_set */
- do {
- cset = css_task_iter_next_css_set(it);
- if (!cset) {
- it->task_pos = NULL;
- return;
+ /* Advance to the next non-empty css_set and find first non-empty tasks list*/
+ while ((cset = css_task_iter_next_css_set(it))) {
+ if (!list_empty(&cset->tasks)) {
+ it->cur_tasks_head = &cset->tasks;
+ break;
+ } else if (!list_empty(&cset->mg_tasks)) {
+ it->cur_tasks_head = &cset->mg_tasks;
+ break;
+ } else if (!list_empty(&cset->dying_tasks)) {
+ it->cur_tasks_head = &cset->dying_tasks;
+ break;
}
- } while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-
- if (!list_empty(&cset->tasks)) {
- it->task_pos = cset->tasks.next;
- it->cur_tasks_head = &cset->tasks;
- } else if (!list_empty(&cset->mg_tasks)) {
- it->task_pos = cset->mg_tasks.next;
- it->cur_tasks_head = &cset->mg_tasks;
- } else {
- it->task_pos = cset->dying_tasks.next;
- it->cur_tasks_head = &cset->dying_tasks;
}
-
- it->tasks_head = &cset->tasks;
- it->mg_tasks_head = &cset->mg_tasks;
- it->dying_tasks_head = &cset->dying_tasks;
+ if (!cset) {
+ it->task_pos = NULL;
+ return;
+ }
+ it->task_pos = it->cur_tasks_head->next;
/*
* We don't keep css_sets locked across iteration steps and thus
@@ -4458,24 +4849,24 @@ static void css_task_iter_advance(struct css_task_iter *it)
repeat:
if (it->task_pos) {
/*
- * Advance iterator to find next entry. cset->tasks is
- * consumed first and then ->mg_tasks. After ->mg_tasks,
- * we move onto the next cset.
+ * Advance iterator to find next entry. We go through cset
+ * tasks, mg_tasks and dying_tasks, when consumed we move onto
+ * the next cset.
*/
if (it->flags & CSS_TASK_ITER_SKIPPED)
it->flags &= ~CSS_TASK_ITER_SKIPPED;
else
it->task_pos = it->task_pos->next;
- if (it->task_pos == it->tasks_head) {
- it->task_pos = it->mg_tasks_head->next;
- it->cur_tasks_head = it->mg_tasks_head;
+ if (it->task_pos == &it->cur_cset->tasks) {
+ it->cur_tasks_head = &it->cur_cset->mg_tasks;
+ it->task_pos = it->cur_tasks_head->next;
}
- if (it->task_pos == it->mg_tasks_head) {
- it->task_pos = it->dying_tasks_head->next;
- it->cur_tasks_head = it->dying_tasks_head;
+ if (it->task_pos == &it->cur_cset->mg_tasks) {
+ it->cur_tasks_head = &it->cur_cset->dying_tasks;
+ it->task_pos = it->cur_tasks_head->next;
}
- if (it->task_pos == it->dying_tasks_head)
+ if (it->task_pos == &it->cur_cset->dying_tasks)
css_task_iter_advance_css_set(it);
} else {
/* called from start, proceed to the first cset */
@@ -4493,12 +4884,12 @@ repeat:
goto repeat;
/* and dying leaders w/o live member threads */
- if (it->cur_tasks_head == it->dying_tasks_head &&
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
!atomic_read(&task->signal->live))
goto repeat;
} else {
/* skip all dying ones */
- if (it->cur_tasks_head == it->dying_tasks_head)
+ if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
goto repeat;
}
}
@@ -4524,7 +4915,7 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
it->ss = css->ss;
it->flags = flags;
- if (it->ss)
+ if (CGROUP_HAS_SUBSYS_CONFIG && it->ss)
it->cset_pos = &css->cgroup->e_csets[css->ss->id];
else
it->cset_pos = &css->cgroup->cset_links;
@@ -4593,21 +4984,21 @@ void css_task_iter_end(struct css_task_iter *it)
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4615,21 +5006,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4662,13 +5050,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
return 0;
}
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+{
+ int ret;
+ struct inode *inode;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+ if (!inode)
+ return -ENOMEM;
+
+ ret = inode_permission(&init_user_ns, inode, MAY_WRITE);
+ iput(inode);
+ return ret;
+}
+
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
- struct inode *inode;
int ret;
lockdep_assert_held(&cgroup_mutex);
@@ -4678,12 +5081,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
com_cgrp = cgroup_parent(com_cgrp);
/* %current should be authorized to migrate to the common ancestor */
- inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
- if (!inode)
- return -ENOMEM;
-
- ret = inode_permission(inode, MAY_WRITE);
- iput(inode);
+ ret = cgroup_may_write(com_cgrp, sb);
if (ret)
return ret;
@@ -4699,19 +5097,42 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
return 0;
}
-static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+ struct cgroup *dst_cgrp,
+ struct super_block *sb, bool threadgroup,
+ struct cgroup_namespace *ns)
{
+ int ret = 0;
+
+ ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns);
+ if (ret)
+ return ret;
+
+ ret = cgroup_migrate_vet_dst(dst_cgrp);
+ if (ret)
+ return ret;
+
+ if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+ ret = -EOPNOTSUPP;
+
+ return ret;
+}
+
+static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ bool threadgroup)
+{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
- bool locked;
+ bool threadgroup_locked;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
if (!dst_cgrp)
return -ENODEV;
- task = cgroup_procs_write_start(buf, true, &locked);
+ task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
ret = PTR_ERR_OR_ZERO(task);
if (ret)
goto out_unlock;
@@ -4721,19 +5142,33 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
+ ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+ of->file->f_path.dentry->d_sb,
+ threadgroup, ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
- ret = cgroup_attach_task(dst_cgrp, task, true);
+ ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
out_finish:
- cgroup_procs_write_finish(task, locked);
+ cgroup_procs_write_finish(task, threadgroup_locked);
out_unlock:
cgroup_kn_unlock(of->kn);
- return ret ?: nbytes;
+ return ret;
+}
+
+static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ return __cgroup_procs_write(of, buf, true) ?: nbytes;
}
static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
@@ -4744,46 +5179,7 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- struct cgroup *src_cgrp, *dst_cgrp;
- struct task_struct *task;
- ssize_t ret;
- bool locked;
-
- buf = strstrip(buf);
-
- dst_cgrp = cgroup_kn_lock_live(of->kn, false);
- if (!dst_cgrp)
- return -ENODEV;
-
- task = cgroup_procs_write_start(buf, false, &locked);
- ret = PTR_ERR_OR_ZERO(task);
- if (ret)
- goto out_unlock;
-
- /* find the source cgroup */
- spin_lock_irq(&css_set_lock);
- src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_irq(&css_set_lock);
-
- /* thread migrations follow the cgroup.procs delegation rule */
- ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
- if (ret)
- goto out_finish;
-
- /* and must be contained in the same domain */
- ret = -EOPNOTSUPP;
- if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
- goto out_finish;
-
- ret = cgroup_attach_task(dst_cgrp, task, false);
-
-out_finish:
- cgroup_procs_write_finish(task, locked);
-out_unlock:
- cgroup_kn_unlock(of->kn);
-
- return ret ?: nbytes;
+ return __cgroup_procs_write(of, buf, false) ?: nbytes;
}
/* cgroup core interface files for the default hierarchy */
@@ -4850,13 +5246,22 @@ static struct cftype cgroup_base_files[] = {
.write = cgroup_freeze_write,
},
{
- .name = "cpu.stat",
+ .name = "cgroup.kill",
.flags = CFTYPE_NOT_ON_ROOT,
+ .write = cgroup_kill_write,
+ },
+ {
+ .name = "cpu.stat",
.seq_show = cpu_stat_show,
},
+ { } /* terminate */
+};
+
+static struct cftype cgroup_psi_files[] = {
#ifdef CONFIG_PSI
{
.name = "io.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4864,6 +5269,7 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "memory.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -4871,11 +5277,27 @@ static struct cftype cgroup_base_files[] = {
},
{
.name = "cpu.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
.release = cgroup_pressure_release,
},
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ {
+ .name = "irq.pressure",
+ .file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
+ .seq_show = cgroup_irq_pressure_show,
+ .write = cgroup_irq_pressure_write,
+ .poll = cgroup_pressure_poll,
+ .release = cgroup_pressure_release,
+ },
+#endif
+ {
+ .name = "cgroup.pressure",
+ .seq_show = cgroup_pressure_show,
+ .write = cgroup_pressure_write,
+ },
#endif /* CONFIG_PSI */
{ } /* terminate */
};
@@ -4938,8 +5360,7 @@ static void css_free_rwork_fn(struct work_struct *work)
cgroup_put(cgroup_parent(cgrp));
kernfs_put(cgrp->kn);
psi_cgroup_free(cgrp);
- if (cgroup_on_dfl(cgrp))
- cgroup_rstat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
kfree(cgrp);
} else {
/*
@@ -4980,8 +5401,7 @@ static void css_release_work_fn(struct work_struct *work)
/* cgroup release path */
TRACE_CGROUP_PATH(release, cgrp);
- if (cgroup_on_dfl(cgrp))
- cgroup_rstat_flush(cgrp);
+ cgroup_rstat_flush(cgrp);
spin_lock_irq(&css_set_lock);
for (tcgrp = cgroup_parent(cgrp); tcgrp;
@@ -5038,7 +5458,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
css_get(css->parent);
}
- if (cgroup_on_dfl(cgrp) && ss->css_rstat_flush)
+ if (ss->css_rstat_flush)
list_add_rcu(&css->rstat_css_node, &cgrp->rstat_css_list);
BUG_ON(cgroup_css(cgrp, ss));
@@ -5128,15 +5548,6 @@ static struct cgroup_subsys_state *css_create(struct cgroup *cgrp,
if (err)
goto err_list_del;
- if (ss->broken_hierarchy && !ss->warned_broken_hierarchy &&
- cgroup_parent(parent)) {
- pr_warn("%s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n",
- current->comm, current->pid, ss->name);
- if (!strcmp(ss->name, "memory"))
- pr_warn("\"memory\" requires setting use_hierarchy to 1 on the root\n");
- ss->warned_broken_hierarchy = true;
- }
-
return css;
err_list_del:
@@ -5163,8 +5574,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
int ret;
/* allocate the cgroup and its ID, 0 is reserved for the root */
- cgrp = kzalloc(struct_size(cgrp, ancestor_ids, (level + 1)),
- GFP_KERNEL);
+ cgrp = kzalloc(struct_size(cgrp, ancestors, (level + 1)), GFP_KERNEL);
if (!cgrp)
return ERR_PTR(-ENOMEM);
@@ -5172,11 +5582,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
if (ret)
goto out_free_cgrp;
- if (cgroup_on_dfl(parent)) {
- ret = cgroup_rstat_init(cgrp);
- if (ret)
- goto out_cancel_ref;
- }
+ ret = cgroup_rstat_init(cgrp);
+ if (ret)
+ goto out_cancel_ref;
/* create the directory */
kn = kernfs_create_dir(parent->kn, name, mode, cgrp);
@@ -5218,7 +5626,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent, const char *name,
spin_lock_irq(&css_set_lock);
for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) {
- cgrp->ancestor_ids[tcgrp->level] = cgroup_id(tcgrp);
+ cgrp->ancestors[tcgrp->level] = tcgrp;
if (tcgrp != cgrp) {
tcgrp->nr_descendants++;
@@ -5263,8 +5671,7 @@ out_psi_free:
out_kernfs_remove:
kernfs_remove(cgrp->kn);
out_stat_exit:
- if (cgroup_on_dfl(parent))
- cgroup_rstat_exit(cgrp);
+ cgroup_rstat_exit(cgrp);
out_cancel_ref:
percpu_ref_exit(&cgrp->self.refcnt);
out_free_cgrp:
@@ -5355,7 +5762,7 @@ out_unlock:
/*
* This is called when the refcnt of a css is confirmed to be killed.
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to
- * initate destruction and put the css ref from kill_css().
+ * initiate destruction and put the css ref from kill_css().
*/
static void css_killed_work_fn(struct work_struct *work)
{
@@ -5499,7 +5906,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
css_clear_dir(&cgrp->self);
kernfs_remove(cgrp->kn);
- if (parent && cgroup_is_threaded(cgrp))
+ if (cgroup_is_threaded(cgrp))
parent->nr_threaded_children--;
spin_lock_irq(&css_set_lock);
@@ -5562,7 +5969,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
- css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
+ css = ss->css_alloc(NULL);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
@@ -5639,8 +6046,6 @@ int __init cgroup_init_early(void)
return 0;
}
-static u16 cgroup_disable_mask __initdata;
-
/**
* cgroup_init - cgroup initialization
*
@@ -5654,16 +6059,11 @@ int __init cgroup_init(void)
BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16);
BUG_ON(cgroup_init_cftypes(NULL, cgroup_base_files));
+ BUG_ON(cgroup_init_cftypes(NULL, cgroup_psi_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup1_base_files));
cgroup_rstat_boot();
- /*
- * The latency of the synchronize_rcu() is too high for cgroups,
- * avoid it at the cost of forcing all readers into the slow path.
- */
- rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
-
get_user_ns(init_cgroup_ns.user_ns);
mutex_lock(&cgroup_mutex);
@@ -5699,12 +6099,8 @@ int __init cgroup_init(void)
* disabled flag and cftype registration needs kmalloc,
* both of which aren't available during early_init.
*/
- if (cgroup_disable_mask & (1 << ssid)) {
- static_branch_disable(cgroup_subsys_enabled_key[ssid]);
- printk(KERN_INFO "Disabling %s control group subsystem\n",
- ss->name);
+ if (!cgroup_ssid_enabled(ssid))
continue;
- }
if (cgroup1_ssid_disabled(ssid))
printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n",
@@ -5782,6 +6178,48 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen)
}
/*
+ * cgroup_get_from_id : get the cgroup associated with cgroup id
+ * @id: cgroup id
+ * On success return the cgrp or ERR_PTR on failure
+ * Only cgroups within current task's cgroup NS are valid.
+ */
+struct cgroup *cgroup_get_from_id(u64 id)
+{
+ struct kernfs_node *kn;
+ struct cgroup *cgrp, *root_cgrp;
+
+ kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
+ if (!kn)
+ return ERR_PTR(-ENOENT);
+
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ kernfs_put(kn);
+ return ERR_PTR(-ENOENT);
+ }
+
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (cgrp && !cgroup_tryget(cgrp))
+ cgrp = NULL;
+
+ rcu_read_unlock();
+ kernfs_put(kn);
+
+ if (!cgrp)
+ return ERR_PTR(-ENOENT);
+
+ root_cgrp = current_cgns_cgroup_dfl();
+ if (!cgroup_is_descendant(cgrp, root_cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return cgrp;
+}
+EXPORT_SYMBOL_GPL(cgroup_get_from_id);
+
+/*
* proc_cgroup_show()
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
@@ -5806,7 +6244,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
struct cgroup *cgrp;
int ssid, count = 0;
- if (root == &cgrp_dfl_root && !cgrp_dfl_visible)
+ if (root == &cgrp_dfl_root && !READ_ONCE(cgrp_dfl_visible))
continue;
seq_printf(m, "%d:", root->hierarchy_id);
@@ -5864,8 +6302,7 @@ out:
* @child: pointer to task_struct of forking parent process.
*
* A task is associated with the init_css_set until cgroup_post_fork()
- * attaches it to the parent's css_set. Empty cg_list indicates that
- * @child isn't holding reference to its css_set.
+ * attaches it to the target css_set.
*/
void cgroup_fork(struct task_struct *child)
{
@@ -5874,20 +6311,208 @@ void cgroup_fork(struct task_struct *child)
}
/**
- * cgroup_can_fork - called on a new task before the process is exposed
- * @child: the task in question.
+ * cgroup_v1v2_get_from_file - get a cgroup pointer from a file pointer
+ * @f: file corresponding to cgroup_dir
*
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
- * returns an error, the fork aborts with that error code. This allows for
- * a cgroup subsystem to conditionally allow or deny new forks.
+ * Find the cgroup from a file pointer associated with a cgroup directory.
+ * Returns a pointer to the cgroup on success. ERR_PTR is returned if the
+ * cgroup cannot be found.
*/
-int cgroup_can_fork(struct task_struct *child)
+static struct cgroup *cgroup_v1v2_get_from_file(struct file *f)
+{
+ struct cgroup_subsys_state *css;
+
+ css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ if (IS_ERR(css))
+ return ERR_CAST(css);
+
+ return css->cgroup;
+}
+
+/**
+ * cgroup_get_from_file - same as cgroup_v1v2_get_from_file, but only supports
+ * cgroup2.
+ * @f: file corresponding to cgroup2_dir
+ */
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_file(f);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
+
+ if (!cgroup_on_dfl(cgrp)) {
+ cgroup_put(cgrp);
+ return ERR_PTR(-EBADF);
+ }
+
+ return cgrp;
+}
+
+/**
+ * cgroup_css_set_fork - find or create a css_set for a child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This functions finds or creates a new css_set which the child
+ * process will be attached to in cgroup_post_fork(). By default,
+ * the child process will be given the same css_set as its parent.
+ *
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
+ * existing css_set which includes the requested cgroup and if not create
+ * a new css_set that the child will be attached to later. If this function
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * to the target cgroup.
+ */
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+ __acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+{
+ int ret;
+ struct cgroup *dst_cgrp = NULL;
+ struct css_set *cset;
+ struct super_block *sb;
+ struct file *f;
+
+ if (kargs->flags & CLONE_INTO_CGROUP)
+ mutex_lock(&cgroup_mutex);
+
+ cgroup_threadgroup_change_begin(current);
+
+ spin_lock_irq(&css_set_lock);
+ cset = task_css_set(current);
+ get_css_set(cset);
+ spin_unlock_irq(&css_set_lock);
+
+ if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+ kargs->cset = cset;
+ return 0;
+ }
+
+ f = fget_raw(kargs->cgroup);
+ if (!f) {
+ ret = -EBADF;
+ goto err;
+ }
+ sb = f->f_path.dentry->d_sb;
+
+ dst_cgrp = cgroup_get_from_file(f);
+ if (IS_ERR(dst_cgrp)) {
+ ret = PTR_ERR(dst_cgrp);
+ dst_cgrp = NULL;
+ goto err;
+ }
+
+ if (cgroup_is_dead(dst_cgrp)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ /*
+ * Verify that we the target cgroup is writable for us. This is
+ * usually done by the vfs layer but since we're not going through
+ * the vfs layer here we need to do it "manually".
+ */
+ ret = cgroup_may_write(dst_cgrp, sb);
+ if (ret)
+ goto err;
+
+ /*
+ * Spawning a task directly into a cgroup works by passing a file
+ * descriptor to the target cgroup directory. This can even be an O_PATH
+ * file descriptor. But it can never be a cgroup.procs file descriptor.
+ * This was done on purpose so spawning into a cgroup could be
+ * conceptualized as an atomic
+ *
+ * fd = openat(dfd_cgroup, "cgroup.procs", ...);
+ * write(fd, <child-pid>, ...);
+ *
+ * sequence, i.e. it's a shorthand for the caller opening and writing
+ * cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
+ * to always use the caller's credentials.
+ */
+ ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+ !(kargs->flags & CLONE_THREAD),
+ current->nsproxy->cgroup_ns);
+ if (ret)
+ goto err;
+
+ kargs->cset = find_css_set(cset, dst_cgrp);
+ if (!kargs->cset) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ put_css_set(cset);
+ fput(f);
+ kargs->cgrp = dst_cgrp;
+ return ret;
+
+err:
+ cgroup_threadgroup_change_end(current);
+ mutex_unlock(&cgroup_mutex);
+ if (f)
+ fput(f);
+ if (dst_cgrp)
+ cgroup_put(dst_cgrp);
+ put_css_set(cset);
+ if (kargs->cset)
+ put_css_set(kargs->cset);
+ return ret;
+}
+
+/**
+ * cgroup_css_set_put_fork - drop references we took during fork
+ * @kargs: the arguments passed to create the child process
+ *
+ * Drop references to the prepared css_set and target cgroup if
+ * CLONE_INTO_CGROUP was requested.
+ */
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+ cgroup_threadgroup_change_end(current);
+
+ if (kargs->flags & CLONE_INTO_CGROUP) {
+ struct cgroup *cgrp = kargs->cgrp;
+ struct css_set *cset = kargs->cset;
+
+ mutex_unlock(&cgroup_mutex);
+
+ if (cset) {
+ put_css_set(cset);
+ kargs->cset = NULL;
+ }
+
+ if (cgrp) {
+ cgroup_put(cgrp);
+ kargs->cgrp = NULL;
+ }
+ }
+}
+
+/**
+ * cgroup_can_fork - called on a new task before the process is exposed
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This prepares a new css_set for the child process which the child will
+ * be attached to in cgroup_post_fork().
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
+ * callback returns an error, the fork aborts with that error code. This
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
+ */
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i, j, ret;
+ ret = cgroup_css_set_fork(kargs);
+ if (ret)
+ return ret;
+
do_each_subsys_mask(ss, i, have_canfork_callback) {
- ret = ss->can_fork(child);
+ ret = ss->can_fork(child, kargs->cset);
if (ret)
goto out_revert;
} while_each_subsys_mask();
@@ -5899,73 +6524,100 @@ out_revert:
if (j >= i)
break;
if (ss->cancel_fork)
- ss->cancel_fork(child);
+ ss->cancel_fork(child, kargs->cset);
}
+ cgroup_css_set_put_fork(kargs);
+
return ret;
}
/**
* cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
- * @child: the task in question
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
*
* This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeded.
+ * cgroup_can_fork() succeeded and cleans up references we took to
+ * prepare a new css_set for the child process in cgroup_can_fork().
*/
-void cgroup_cancel_fork(struct task_struct *child)
+void cgroup_cancel_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
{
struct cgroup_subsys *ss;
int i;
for_each_subsys(ss, i)
if (ss->cancel_fork)
- ss->cancel_fork(child);
+ ss->cancel_fork(child, kargs->cset);
+
+ cgroup_css_set_put_fork(kargs);
}
/**
- * cgroup_post_fork - called on a new task after adding it to the task list
- * @child: the task in question
- *
- * Adds the task to the list running through its css_set if necessary and
- * call the subsystem fork() callbacks. Has to be after the task is
- * visible on the task list in case we race with the first call to
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
- * list.
+ * cgroup_post_fork - finalize cgroup setup for the child process
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * Attach the child process to its css_set calling the subsystem fork()
+ * callbacks.
*/
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+ struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
{
+ unsigned long cgrp_flags = 0;
+ bool kill = false;
struct cgroup_subsys *ss;
struct css_set *cset;
int i;
+ cset = kargs->cset;
+ kargs->cset = NULL;
+
spin_lock_irq(&css_set_lock);
/* init tasks are special, only link regular threads */
if (likely(child->pid)) {
+ if (kargs->cgrp)
+ cgrp_flags = kargs->cgrp->flags;
+ else
+ cgrp_flags = cset->dfl_cgrp->flags;
+
WARN_ON_ONCE(!list_empty(&child->cg_list));
- cset = task_css_set(current); /* current is @child's parent */
- get_css_set(cset);
cset->nr_tasks++;
css_set_move_task(child, NULL, cset, false);
+ } else {
+ put_css_set(cset);
+ cset = NULL;
}
- /*
- * If the cgroup has to be frozen, the new task has too. Let's set
- * the JOBCTL_TRAP_FREEZE jobctl bit to get the task into the
- * frozen state.
- */
- if (unlikely(cgroup_task_freeze(child))) {
- spin_lock(&child->sighand->siglock);
- WARN_ON_ONCE(child->frozen);
- child->jobctl |= JOBCTL_TRAP_FREEZE;
- spin_unlock(&child->sighand->siglock);
+ if (!(child->flags & PF_KTHREAD)) {
+ if (unlikely(test_bit(CGRP_FREEZE, &cgrp_flags))) {
+ /*
+ * If the cgroup has to be frozen, the new task has
+ * too. Let's set the JOBCTL_TRAP_FREEZE jobctl bit to
+ * get the task into the frozen state.
+ */
+ spin_lock(&child->sighand->siglock);
+ WARN_ON_ONCE(child->frozen);
+ child->jobctl |= JOBCTL_TRAP_FREEZE;
+ spin_unlock(&child->sighand->siglock);
+
+ /*
+ * Calling cgroup_update_frozen() isn't required here,
+ * because it will be called anyway a bit later from
+ * do_freezer_trap(). So we avoid cgroup's transient
+ * switch from the frozen state and back.
+ */
+ }
/*
- * Calling cgroup_update_frozen() isn't required here,
- * because it will be called anyway a bit later from
- * do_freezer_trap(). So we avoid cgroup's transient switch
- * from the frozen state and back.
+ * If the cgroup is to be killed notice it now and take the
+ * child down right after we finished preparing it for
+ * userspace.
*/
+ kill = test_bit(CGRP_KILL, &cgrp_flags);
}
spin_unlock_irq(&css_set_lock);
@@ -5978,6 +6630,21 @@ void cgroup_post_fork(struct task_struct *child)
do_each_subsys_mask(ss, i, have_fork_callback) {
ss->fork(child);
} while_each_subsys_mask();
+
+ /* Make the new cset the root_cset of the new cgroup namespace. */
+ if (kargs->flags & CLONE_NEWCGROUP) {
+ struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+
+ get_css_set(cset);
+ child->nsproxy->cgroup_ns->root_cset = cset;
+ put_css_set(rcset);
+ }
+
+ /* Cgroup has to be killed so take down child immediately. */
+ if (unlikely(kill))
+ do_send_sig_info(SIGKILL, SEND_SIG_NOINFO, child, PIDTYPE_TGID);
+
+ cgroup_css_set_put_fork(kargs);
}
/**
@@ -6002,7 +6669,8 @@ void cgroup_exit(struct task_struct *tsk)
cset->nr_tasks--;
WARN_ON_ONCE(cgroup_task_frozen(tsk));
- if (unlikely(cgroup_task_freeze(tsk)))
+ if (unlikely(!(tsk->flags & PF_KTHREAD) &&
+ test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
cgroup_update_frozen(task_dfl_cgroup(tsk));
spin_unlock_irq(&css_set_lock);
@@ -6048,7 +6716,19 @@ static int __init cgroup_disable(char *str)
if (strcmp(token, ss->name) &&
strcmp(token, ss->legacy_name))
continue;
- cgroup_disable_mask |= 1 << i;
+
+ static_branch_disable(cgroup_subsys_enabled_key[i]);
+ pr_info("Disabling %s control group subsystem\n",
+ ss->name);
+ }
+
+ for (i = 0; i < OPT_FEATURE_COUNT; i++) {
+ if (strcmp(token, cgroup_opt_feature_names[i]))
+ continue;
+ cgroup_feature_disable_mask |= 1 << i;
+ pr_info("Disabling %s control group feature\n",
+ cgroup_opt_feature_names[i]);
+ break;
}
}
return 1;
@@ -6125,46 +6805,51 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
- * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR)
- * if @path points to a non-directory.
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
+ * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
- struct cgroup *cgrp;
+ struct cgroup *cgrp = ERR_PTR(-ENOENT);
+ struct cgroup *root_cgrp;
- mutex_lock(&cgroup_mutex);
+ root_cgrp = current_cgns_cgroup_dfl();
+ kn = kernfs_walk_and_get(root_cgrp->kn, path);
+ if (!kn)
+ goto out;
- kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
- if (kn) {
- if (kernfs_type(kn) == KERNFS_DIR) {
- cgrp = kn->priv;
- cgroup_get_live(cgrp);
- } else {
- cgrp = ERR_PTR(-ENOTDIR);
- }
- kernfs_put(kn);
- } else {
- cgrp = ERR_PTR(-ENOENT);
+ if (kernfs_type(kn) != KERNFS_DIR) {
+ cgrp = ERR_PTR(-ENOTDIR);
+ goto out_kernfs;
}
- mutex_unlock(&cgroup_mutex);
+ rcu_read_lock();
+
+ cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+ if (!cgrp || !cgroup_tryget(cgrp))
+ cgrp = ERR_PTR(-ENOENT);
+
+ rcu_read_unlock();
+
+out_kernfs:
+ kernfs_put(kn);
+out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
/**
- * cgroup_get_from_fd - get a cgroup pointer from a fd
- * @fd: fd obtained by open(cgroup2_dir)
+ * cgroup_v1v2_get_from_fd - get a cgroup pointer from a fd
+ * @fd: fd obtained by open(cgroup_dir)
*
* Find the cgroup from a fd which should be obtained
* by opening a cgroup directory. Returns a pointer to the
* cgroup on success. ERR_PTR is returned if the cgroup
* cannot be found.
*/
-struct cgroup *cgroup_get_from_fd(int fd)
+struct cgroup *cgroup_v1v2_get_from_fd(int fd)
{
- struct cgroup_subsys_state *css;
struct cgroup *cgrp;
struct file *f;
@@ -6172,17 +6857,27 @@ struct cgroup *cgroup_get_from_fd(int fd)
if (!f)
return ERR_PTR(-EBADF);
- css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+ cgrp = cgroup_v1v2_get_from_file(f);
fput(f);
- if (IS_ERR(css))
- return ERR_CAST(css);
+ return cgrp;
+}
+
+/**
+ * cgroup_get_from_fd - same as cgroup_v1v2_get_from_fd, but only supports
+ * cgroup2.
+ * @fd: fd obtained by open(cgroup2_dir)
+ */
+struct cgroup *cgroup_get_from_fd(int fd)
+{
+ struct cgroup *cgrp = cgroup_v1v2_get_from_fd(fd);
+
+ if (IS_ERR(cgrp))
+ return ERR_CAST(cgrp);
- cgrp = css->cgroup;
if (!cgroup_on_dfl(cgrp)) {
cgroup_put(cgrp);
return ERR_PTR(-EBADF);
}
-
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
@@ -6235,106 +6930,56 @@ int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v)
*/
#ifdef CONFIG_SOCK_CGROUP_DATA
-#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID)
-
-DEFINE_SPINLOCK(cgroup_sk_update_lock);
-static bool cgroup_sk_alloc_disabled __read_mostly;
-
-void cgroup_sk_alloc_disable(void)
-{
- if (cgroup_sk_alloc_disabled)
- return;
- pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n");
- cgroup_sk_alloc_disabled = true;
-}
-
-#else
-
-#define cgroup_sk_alloc_disabled false
-
-#endif
-
void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
{
- if (cgroup_sk_alloc_disabled)
- return;
-
- /* Socket clone path */
- if (skcd->val) {
- /*
- * We might be cloning a socket which is left in an empty
- * cgroup and the cgroup might have already been rmdir'd.
- * Don't use cgroup_get_live().
- */
- cgroup_get(sock_cgroup_ptr(skcd));
- cgroup_bpf_get(sock_cgroup_ptr(skcd));
- return;
- }
-
- /* Don't associate the sock with unrelated interrupted task's cgroup. */
- if (in_interrupt())
- return;
+ struct cgroup *cgroup;
rcu_read_lock();
+ /* Don't associate the sock with unrelated interrupted task's cgroup. */
+ if (in_interrupt()) {
+ cgroup = &cgrp_dfl_root.cgrp;
+ cgroup_get(cgroup);
+ goto out;
+ }
while (true) {
struct css_set *cset;
cset = task_css_set(current);
if (likely(cgroup_tryget(cset->dfl_cgrp))) {
- skcd->val = (unsigned long)cset->dfl_cgrp;
- cgroup_bpf_get(cset->dfl_cgrp);
+ cgroup = cset->dfl_cgrp;
break;
}
cpu_relax();
}
-
+out:
+ skcd->cgroup = cgroup;
+ cgroup_bpf_get(cgroup);
rcu_read_unlock();
}
-void cgroup_sk_free(struct sock_cgroup_data *skcd)
+void cgroup_sk_clone(struct sock_cgroup_data *skcd)
{
struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- cgroup_bpf_put(cgrp);
- cgroup_put(cgrp);
+ /*
+ * We might be cloning a socket which is left in an empty
+ * cgroup and the cgroup might have already been rmdir'd.
+ * Don't use cgroup_get_live().
+ */
+ cgroup_get(cgrp);
+ cgroup_bpf_get(cgrp);
}
-#endif /* CONFIG_SOCK_CGROUP_DATA */
-
-#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
- struct bpf_prog *replace_prog, enum bpf_attach_type type,
- u32 flags)
-{
- int ret;
-
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, type, flags);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
- enum bpf_attach_type type, u32 flags)
+void cgroup_sk_free(struct sock_cgroup_data *skcd)
{
- int ret;
+ struct cgroup *cgrp = sock_cgroup_ptr(skcd);
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_detach(cgrp, prog, type);
- mutex_unlock(&cgroup_mutex);
- return ret;
+ cgroup_bpf_put(cgrp);
+ cgroup_put(cgrp);
}
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
- union bpf_attr __user *uattr)
-{
- int ret;
- mutex_lock(&cgroup_mutex);
- ret = __cgroup_bpf_query(cgrp, attr, uattr);
- mutex_unlock(&cgroup_mutex);
- return ret;
-}
-#endif /* CONFIG_CGROUP_BPF */
+#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
@@ -6366,8 +7011,11 @@ static ssize_t delegate_show(struct kobject *kobj, struct kobj_attribute *attr,
int ssid;
ssize_t ret = 0;
- ret = show_delegatable_files(cgroup_base_files, buf, PAGE_SIZE - ret,
- NULL);
+ ret = show_delegatable_files(cgroup_base_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
+ if (cgroup_psi_enabled())
+ ret += show_delegatable_files(cgroup_psi_files, buf + ret,
+ PAGE_SIZE - ret, NULL);
for_each_subsys(ss, ssid)
ret += show_delegatable_files(ss->dfl_cftypes, buf + ret,
@@ -6381,7 +7029,11 @@ static struct kobj_attribute cgroup_delegate_attr = __ATTR_RO(delegate);
static ssize_t features_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return snprintf(buf, PAGE_SIZE, "nsdelegate\nmemory_localevents\n");
+ return snprintf(buf, PAGE_SIZE,
+ "nsdelegate\n"
+ "favordynmods\n"
+ "memory_localevents\n"
+ "memory_recursiveprot\n");
}
static struct kobj_attribute cgroup_features_attr = __ATTR_RO(features);