aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-05 14:51:32 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-05 14:51:32 -0800
commit69234acee54407962a20bedf90ef9c96326994b5 (patch)
tree5e979b1a489d866691c2c65ac3f46b4f29feef68 /include/linux
parentMerge branch 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/libata (diff)
parentcgroup: fix race condition around termination check in css_task_iter_next() (diff)
downloadlinux-dev-69234acee54407962a20bedf90ef9c96326994b5.tar.xz
linux-dev-69234acee54407962a20bedf90ef9c96326994b5.zip
Merge branch 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo: "The cgroup core saw several significant updates this cycle: - percpu_rwsem for threadgroup locking is reinstated. This was temporarily dropped due to down_write latency issues. Oleg's rework of percpu_rwsem which is scheduled to be merged in this merge window resolves the issue. - On the v2 hierarchy, when controllers are enabled and disabled, all operations are atomic and can fail and revert cleanly. This allows ->can_attach() failure which is necessary for cpu RT slices. - Tasks now stay associated with the original cgroups after exit until released. This allows tracking resources held by zombies (e.g. pids) and makes it easy to find out where zombies came from on the v2 hierarchy. The pids controller was broken before these changes as zombies escaped the limits; unfortunately, updating this behavior required too many invasive changes and I don't think it's a good idea to backport them, so the pids controller on 4.3, the first version which included the pids controller, will stay broken at least until I'm sure about the cgroup core changes. - Optimization of a couple common tests using static_key" * 'for-4.4' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup: (38 commits) cgroup: fix race condition around termination check in css_task_iter_next() blkcg: don't create "io.stat" on the root cgroup cgroup: drop cgroup__DEVEL__legacy_files_on_dfl cgroup: replace error handling in cgroup_init() with WARN_ON()s cgroup: add cgroup_subsys->free() method and use it to fix pids controller cgroup: keep zombies associated with their original cgroups cgroup: make css_set_rwsem a spinlock and rename it to css_set_lock cgroup: don't hold css_set_rwsem across css task iteration cgroup: reorganize css_task_iter functions cgroup: factor out css_set_move_task() cgroup: keep css_set and task lists in chronological order cgroup: make cgroup_destroy_locked() test cgroup_is_populated() cgroup: make css_sets pin the associated cgroups cgroup: relocate cgroup_[try]get/put() cgroup: move check_for_release() invocation cgroup: replace cgroup_has_tasks() with cgroup_is_populated() cgroup: make cgroup->nr_populated count the number of populated css_sets cgroup: remove an unused parameter from cgroup_task_migrate() cgroup: fix too early usage of static_branch_disable() cgroup: make cgroup_update_dfl_csses() migrate all target processes atomically ...
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/backing-dev.h5
-rw-r--r--include/linux/cgroup-defs.h76
-rw-r--r--include/linux/cgroup.h129
-rw-r--r--include/linux/hugetlb_cgroup.h4
-rw-r--r--include/linux/init_task.h8
-rw-r--r--include/linux/jump_label.h18
-rw-r--r--include/linux/memcontrol.h8
-rw-r--r--include/linux/sched.h12
8 files changed, 144 insertions, 116 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index c85f74946a8b..c82794f20110 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,7 +13,6 @@
#include <linux/sched.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
-#include <linux/memcontrol.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev-defs.h>
#include <linux/slab.h>
@@ -267,8 +266,8 @@ static inline bool inode_cgwb_enabled(struct inode *inode)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
- return cgroup_on_dfl(mem_cgroup_root_css->cgroup) &&
- cgroup_on_dfl(blkcg_root_css->cgroup) &&
+ return cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
+ cgroup_subsys_on_dfl(io_cgrp_subsys) &&
bdi_cap_account_dirty(bdi) &&
(bdi->capabilities & BDI_CAP_CGROUP_WRITEBACK) &&
(inode->i_sb->s_iflags & SB_I_CGROUPWB);
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 8492721b39be..60d44b26276d 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -76,6 +76,7 @@ enum {
CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */
CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */
CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */
+ CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */
/* internal flags, do not use outside cgroup core proper */
__CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */
@@ -83,6 +84,17 @@ enum {
};
/*
+ * cgroup_file is the handle for a file instance created in a cgroup which
+ * is used, for example, to generate file changed notifications. This can
+ * be obtained by setting cftype->file_offset.
+ */
+struct cgroup_file {
+ /* do not access any fields from outside cgroup core */
+ struct list_head node; /* anchored at css->files */
+ struct kernfs_node *kn;
+};
+
+/*
* Per-subsystem/per-cgroup state maintained by the system. This is the
* fundamental structural building block that controllers deal with.
*
@@ -122,6 +134,9 @@ struct cgroup_subsys_state {
*/
u64 serial_nr;
+ /* all cgroup_files associated with this css */
+ struct list_head files;
+
/* percpu_ref killing and RCU release */
struct rcu_head rcu_head;
struct work_struct destroy_work;
@@ -196,6 +211,9 @@ struct css_set {
*/
struct list_head e_cset_node[CGROUP_SUBSYS_COUNT];
+ /* all css_task_iters currently walking this cset */
+ struct list_head task_iters;
+
/* For RCU-protected deletion */
struct rcu_head rcu_head;
};
@@ -217,16 +235,16 @@ struct cgroup {
int id;
/*
- * If this cgroup contains any tasks, it contributes one to
- * populated_cnt. All children with non-zero popuplated_cnt of
- * their own contribute one. The count is zero iff there's no task
- * in this cgroup or its subtree.
+ * Each non-empty css_set associated with this cgroup contributes
+ * one to populated_cnt. All children with non-zero popuplated_cnt
+ * of their own contribute one. The count is zero iff there's no
+ * task in this cgroup or its subtree.
*/
int populated_cnt;
struct kernfs_node *kn; /* cgroup kernfs entry */
- struct kernfs_node *procs_kn; /* kn for "cgroup.procs" */
- struct kernfs_node *populated_kn; /* kn for "cgroup.subtree_populated" */
+ struct cgroup_file procs_file; /* handle for "cgroup.procs" */
+ struct cgroup_file events_file; /* handle for "cgroup.events" */
/*
* The bitmask of subsystems enabled on the child cgroups.
@@ -324,11 +342,6 @@ struct cftype {
*/
char name[MAX_CFTYPE_NAME];
unsigned long private;
- /*
- * If not 0, file mode is set to this value, otherwise it will
- * be figured out automatically
- */
- umode_t mode;
/*
* The maximum length of string, excluding trailing nul, that can
@@ -340,6 +353,14 @@ struct cftype {
unsigned int flags;
/*
+ * If non-zero, should contain the offset from the start of css to
+ * a struct cgroup_file field. cgroup will record the handle of
+ * the created file into it. The recorded handle can be used as
+ * long as the containing css remains accessible.
+ */
+ unsigned int file_offset;
+
+ /*
* Fields used for internal bookkeeping. Initialized automatically
* during registration.
*/
@@ -414,12 +435,10 @@ struct cgroup_subsys {
int (*can_fork)(struct task_struct *task, void **priv_p);
void (*cancel_fork)(struct task_struct *task, void *priv);
void (*fork)(struct task_struct *task, void *priv);
- void (*exit)(struct cgroup_subsys_state *css,
- struct cgroup_subsys_state *old_css,
- struct task_struct *task);
+ void (*exit)(struct task_struct *task);
+ void (*free)(struct task_struct *task);
void (*bind)(struct cgroup_subsys_state *root_css);
- int disabled;
int early_init;
/*
@@ -473,8 +492,31 @@ struct cgroup_subsys {
unsigned int depends_on;
};
-void cgroup_threadgroup_change_begin(struct task_struct *tsk);
-void cgroup_threadgroup_change_end(struct task_struct *tsk);
+extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
+
+/**
+ * cgroup_threadgroup_change_begin - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_begin() and allows cgroup operations to
+ * synchronize against threadgroup changes using a percpu_rw_semaphore.
+ */
+static inline void cgroup_threadgroup_change_begin(struct task_struct *tsk)
+{
+ percpu_down_read(&cgroup_threadgroup_rwsem);
+}
+
+/**
+ * cgroup_threadgroup_change_end - threadgroup exclusion for cgroups
+ * @tsk: target task
+ *
+ * Called from threadgroup_change_end(). Counterpart of
+ * cgroup_threadcgroup_change_begin().
+ */
+static inline void cgroup_threadgroup_change_end(struct task_struct *tsk)
+{
+ percpu_up_read(&cgroup_threadgroup_rwsem);
+}
#else /* CONFIG_CGROUPS */
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index eb7ca55f72ef..22e3754f89c5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -13,10 +13,10 @@
#include <linux/nodemask.h>
#include <linux/rculist.h>
#include <linux/cgroupstats.h>
-#include <linux/rwsem.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/kernfs.h>
+#include <linux/jump_label.h>
#include <linux/cgroup-defs.h>
@@ -41,6 +41,10 @@ struct css_task_iter {
struct list_head *task_pos;
struct list_head *tasks_head;
struct list_head *mg_tasks_head;
+
+ struct css_set *cur_cset;
+ struct task_struct *cur_task;
+ struct list_head iters_node; /* css_set->task_iters */
};
extern struct cgroup_root cgrp_dfl_root;
@@ -50,6 +54,26 @@ extern struct css_set init_css_set;
#include <linux/cgroup_subsys.h>
#undef SUBSYS
+#define SUBSYS(_x) \
+ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \
+ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key;
+#include <linux/cgroup_subsys.h>
+#undef SUBSYS
+
+/**
+ * cgroup_subsys_enabled - fast test on whether a subsys is enabled
+ * @ss: subsystem in question
+ */
+#define cgroup_subsys_enabled(ss) \
+ static_branch_likely(&ss ## _enabled_key)
+
+/**
+ * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy
+ * @ss: subsystem in question
+ */
+#define cgroup_subsys_on_dfl(ss) \
+ static_branch_likely(&ss ## _on_dfl_key)
+
bool css_has_online_children(struct cgroup_subsys_state *css);
struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss);
struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
@@ -78,6 +102,7 @@ extern void cgroup_cancel_fork(struct task_struct *p,
extern void cgroup_post_fork(struct task_struct *p,
void *old_ss_priv[CGROUP_CANFORK_COUNT]);
void cgroup_exit(struct task_struct *p);
+void cgroup_free(struct task_struct *p);
int cgroup_init_early(void);
int cgroup_init(void);
@@ -211,11 +236,33 @@ void css_task_iter_end(struct css_task_iter *it);
* cgroup_taskset_for_each - iterate cgroup_taskset
* @task: the loop cursor
* @tset: taskset to iterate
+ *
+ * @tset may contain multiple tasks and they may belong to multiple
+ * processes. When there are multiple tasks in @tset, if a task of a
+ * process is in @tset, all tasks of the process are in @tset. Also, all
+ * are guaranteed to share the same source and destination csses.
+ *
+ * Iteration is not in any specific order.
*/
#define cgroup_taskset_for_each(task, tset) \
for ((task) = cgroup_taskset_first((tset)); (task); \
(task) = cgroup_taskset_next((tset)))
+/**
+ * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset
+ * @leader: the loop cursor
+ * @tset: takset to iterate
+ *
+ * Iterate threadgroup leaders of @tset. For single-task migrations, @tset
+ * may not contain any.
+ */
+#define cgroup_taskset_for_each_leader(leader, tset) \
+ for ((leader) = cgroup_taskset_first((tset)); (leader); \
+ (leader) = cgroup_taskset_next((tset))) \
+ if ((leader) != (leader)->group_leader) \
+ ; \
+ else
+
/*
* Inline functions.
*/
@@ -320,11 +367,11 @@ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n)
*/
#ifdef CONFIG_PROVE_RCU
extern struct mutex cgroup_mutex;
-extern struct rw_semaphore css_set_rwsem;
+extern spinlock_t css_set_lock;
#define task_css_set_check(task, __c) \
rcu_dereference_check((task)->cgroups, \
lockdep_is_held(&cgroup_mutex) || \
- lockdep_is_held(&css_set_rwsem) || \
+ lockdep_is_held(&css_set_lock) || \
((task)->flags & PF_EXITING) || (__c))
#else
#define task_css_set_check(task, __c) \
@@ -412,68 +459,10 @@ static inline struct cgroup *task_cgroup(struct task_struct *task,
return task_css(task, subsys_id)->cgroup;
}
-/**
- * cgroup_on_dfl - test whether a cgroup is on the default hierarchy
- * @cgrp: the cgroup of interest
- *
- * The default hierarchy is the v2 interface of cgroup and this function
- * can be used to test whether a cgroup is on the default hierarchy for
- * cases where a subsystem should behave differnetly depending on the
- * interface version.
- *
- * The set of behaviors which change on the default hierarchy are still
- * being determined and the mount option is prefixed with __DEVEL__.
- *
- * List of changed behaviors:
- *
- * - Mount options "noprefix", "xattr", "clone_children", "release_agent"
- * and "name" are disallowed.
- *
- * - When mounting an existing superblock, mount options should match.
- *
- * - Remount is disallowed.
- *
- * - rename(2) is disallowed.
- *
- * - "tasks" is removed. Everything should be at process granularity. Use
- * "cgroup.procs" instead.
- *
- * - "cgroup.procs" is not sorted. pids will be unique unless they got
- * recycled inbetween reads.
- *
- * - "release_agent" and "notify_on_release" are removed. Replacement
- * notification mechanism will be implemented.
- *
- * - "cgroup.clone_children" is removed.
- *
- * - "cgroup.subtree_populated" is available. Its value is 0 if the cgroup
- * and its descendants contain no task; otherwise, 1. The file also
- * generates kernfs notification which can be monitored through poll and
- * [di]notify when the value of the file changes.
- *
- * - cpuset: tasks will be kept in empty cpusets when hotplug happens and
- * take masks of ancestors with non-empty cpus/mems, instead of being
- * moved to an ancestor.
- *
- * - cpuset: a task can be moved into an empty cpuset, and again it takes
- * masks of ancestors.
- *
- * - memcg: use_hierarchy is on by default and the cgroup file for the flag
- * is not created.
- *
- * - blkcg: blk-throttle becomes properly hierarchical.
- *
- * - debug: disallowed on the default hierarchy.
- */
-static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
-{
- return cgrp->root == &cgrp_dfl_root;
-}
-
/* no synchronization, the result can only be used as a hint */
-static inline bool cgroup_has_tasks(struct cgroup *cgrp)
+static inline bool cgroup_is_populated(struct cgroup *cgrp)
{
- return !list_empty(&cgrp->cset_links);
+ return cgrp->populated_cnt;
}
/* returns ino associated with a cgroup */
@@ -527,6 +516,19 @@ static inline void pr_cont_cgroup_path(struct cgroup *cgrp)
pr_cont_kernfs_path(cgrp->kn);
}
+/**
+ * cgroup_file_notify - generate a file modified event for a cgroup_file
+ * @cfile: target cgroup_file
+ *
+ * @cfile must have been obtained by setting cftype->file_offset.
+ */
+static inline void cgroup_file_notify(struct cgroup_file *cfile)
+{
+ /* might not have been created due to one of the CFTYPE selector flags */
+ if (cfile->kn)
+ kernfs_notify(cfile->kn);
+}
+
#else /* !CONFIG_CGROUPS */
struct cgroup_subsys_state;
@@ -546,6 +548,7 @@ static inline void cgroup_cancel_fork(struct task_struct *p,
static inline void cgroup_post_fork(struct task_struct *p,
void *ss_priv[CGROUP_CANFORK_COUNT]) {}
static inline void cgroup_exit(struct task_struct *p) {}
+static inline void cgroup_free(struct task_struct *p) {}
static inline int cgroup_init_early(void) { return 0; }
static inline int cgroup_init(void) { return 0; }
diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h
index bcc853eccc85..7edd30515298 100644
--- a/include/linux/hugetlb_cgroup.h
+++ b/include/linux/hugetlb_cgroup.h
@@ -48,9 +48,7 @@ int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg)
static inline bool hugetlb_cgroup_disabled(void)
{
- if (hugetlb_cgrp_subsys.disabled)
- return true;
- return false;
+ return !cgroup_subsys_enabled(hugetlb_cgrp_subsys);
}
extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 810a34f60424..1c1ff7e4faa4 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -25,13 +25,6 @@
extern struct files_struct init_files;
extern struct fs_struct init_fs;
-#ifdef CONFIG_CGROUPS
-#define INIT_GROUP_RWSEM(sig) \
- .group_rwsem = __RWSEM_INITIALIZER(sig.group_rwsem),
-#else
-#define INIT_GROUP_RWSEM(sig)
-#endif
-
#ifdef CONFIG_CPUSETS
#define INIT_CPUSET_SEQ(tsk) \
.mems_allowed_seq = SEQCNT_ZERO(tsk.mems_allowed_seq),
@@ -65,7 +58,6 @@ extern struct fs_struct init_fs;
INIT_PREV_CPUTIME(sig) \
.cred_guard_mutex = \
__MUTEX_INITIALIZER(sig.cred_guard_mutex), \
- INIT_GROUP_RWSEM(sig) \
}
extern struct nsproxy init_nsproxy;
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index f1094238ab2a..8dde55974f18 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -214,11 +214,6 @@ static inline int jump_label_apply_nops(struct module *mod)
#define STATIC_KEY_INIT STATIC_KEY_INIT_FALSE
#define jump_label_enabled static_key_enabled
-static inline bool static_key_enabled(struct static_key *key)
-{
- return static_key_count(key) > 0;
-}
-
static inline void static_key_enable(struct static_key *key)
{
int count = static_key_count(key);
@@ -265,6 +260,17 @@ struct static_key_false {
#define DEFINE_STATIC_KEY_FALSE(name) \
struct static_key_false name = STATIC_KEY_FALSE_INIT
+extern bool ____wrong_branch_error(void);
+
+#define static_key_enabled(x) \
+({ \
+ if (!__builtin_types_compatible_p(typeof(*x), struct static_key) && \
+ !__builtin_types_compatible_p(typeof(*x), struct static_key_true) &&\
+ !__builtin_types_compatible_p(typeof(*x), struct static_key_false)) \
+ ____wrong_branch_error(); \
+ static_key_count((struct static_key *)x) > 0; \
+})
+
#ifdef HAVE_JUMP_LABEL
/*
@@ -323,8 +329,6 @@ struct static_key_false {
* See jump_label_type() / jump_label_init_type().
*/
-extern bool ____wrong_branch_error(void);
-
#define static_branch_likely(x) \
({ \
bool branch; \
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 3e3318ddfc0e..27251ed428f7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -213,6 +213,9 @@ struct mem_cgroup {
/* OOM-Killer disable */
int oom_kill_disable;
+ /* handle for "memory.events" */
+ struct cgroup_file events_file;
+
/* protect arrays of thresholds */
struct mutex thresholds_lock;
@@ -285,6 +288,7 @@ static inline void mem_cgroup_events(struct mem_cgroup *memcg,
unsigned int nr)
{
this_cpu_add(memcg->stat->events[idx], nr);
+ cgroup_file_notify(&memcg->events_file);
}
bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
@@ -346,9 +350,7 @@ ino_t page_cgroup_ino(struct page *page);
static inline bool mem_cgroup_disabled(void)
{
- if (memory_cgrp_subsys.disabled)
- return true;
- return false;
+ return !cgroup_subsys_enabled(memory_cgrp_subsys);
}
/*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index c115d617739d..4effb1025fbb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -771,18 +771,6 @@ struct signal_struct {
unsigned audit_tty_log_passwd;
struct tty_audit_buf *tty_audit_buf;
#endif
-#ifdef CONFIG_CGROUPS
- /*
- * group_rwsem prevents new tasks from entering the threadgroup and
- * member tasks from exiting,a more specifically, setting of
- * PF_EXITING. fork and exit paths are protected with this rwsem
- * using threadgroup_change_begin/end(). Users which require
- * threadgroup to remain stable should use threadgroup_[un]lock()
- * which also takes care of exec path. Currently, cgroup is the
- * only user.
- */
- struct rw_semaphore group_rwsem;
-#endif
oom_flags_t oom_flags;
short oom_score_adj; /* OOM kill score adjustment */