From 8b5a5a9dbca914d1f7d70276024d1525a3c94081 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:54 -0700
Subject: cgroup: deprecate remount option changes

This patch marks the following features for deprecation.

* Rebinding subsys by remount: Never reached useful state - only works
  on empty hierarchies.

* release_agent update by remount: release_agent itself will be
  replaced with conventional fsnotify notification.

v2: Lennart pointed out that "name=" is necessary for mounts w/o any
    controller attached.  Drop "name=" deprecation.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Lennart Poettering <mzxreary@0pointer.de>
---
 kernel/cgroup.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..741c120e06c1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1294,6 +1294,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (ret)
 		goto out_unlock;
 
+	/* See feature-removal-schedule.txt */
+	if (opts.subsys_bits != root->actual_subsys_bits || opts.release_agent)
+		pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n",
+			   task_tgid_nr(current), current->comm);
+
 	/* Don't allow flags or name to change at remount */
 	if (opts.flags != root->flags ||
 	    (opts.name && strcmp(opts.name, root->name))) {
-- 
cgit v1.2.3-59-g8ed1b


From ff4c8d503e2583b485da46d0ec3dcc29639ab889 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:54 -0700
Subject: cgroup: move cgroup_clear_directory() call out of
 cgroup_populate_dir()

cgroup_populate_dir() currently clears all files and then repopulate
the directory; however, the clearing part is only useful when it's
called from cgroup_remount().  Relocate the invocation to
cgroup_remount().

This is to prepare for further cgroup file handling updates.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cgroup.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 741c120e06c1..8f72853131d3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1313,7 +1313,8 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 		goto out_unlock;
 	}
 
-	/* (re)populate subsystem files */
+	/* clear out any existing files and repopulate subsystem files */
+	cgroup_clear_directory(cgrp->dentry);
 	cgroup_populate_dir(cgrp);
 
 	if (opts.release_agent)
@@ -3644,9 +3645,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	int err;
 	struct cgroup_subsys *ss;
 
-	/* First clear out any existing files */
-	cgroup_clear_directory(cgrp->dentry);
-
 	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
 	if (err < 0)
 		return err;
-- 
cgit v1.2.3-59-g8ed1b


From b0ca5a84fc3ad8f75bb2b7ac3dc6a77151cd3906 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:54 -0700
Subject: cgroup: build list of all cgroups under a given cgroupfs_root

Build a list of all cgroups anchored at cgroupfs_root->allcg_list and
going through cgroup->allcg_node.  The list is protected by
cgroup_mutex and will be used to improve cgroup file handling.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  2 ++
 kernel/cgroup.c        | 10 ++++++++++
 2 files changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 5a85b3415c1b..ad2a14680b7f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -191,6 +191,8 @@ struct cgroup {
 	 */
 	struct list_head css_sets;
 
+	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
+
 	/*
 	 * Linked list running through all cgroups that can
 	 * potentially be reaped by the release agent. Protected by
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 8f72853131d3..db4319c030d0 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -127,6 +127,9 @@ struct cgroupfs_root {
 	/* A list running through the active hierarchies */
 	struct list_head root_list;
 
+	/* All cgroups on this root, cgroup_mutex protected */
+	struct list_head allcg_list;
+
 	/* Hierarchy-specific flags */
 	unsigned long flags;
 
@@ -1350,11 +1353,14 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
 	struct cgroup *cgrp = &root->top_cgroup;
+
 	INIT_LIST_HEAD(&root->subsys_list);
 	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->allcg_list);
 	root->number_of_cgroups = 1;
 	cgrp->root = root;
 	cgrp->top_cgroup = cgrp;
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
 	init_cgroup_housekeeping(cgrp);
 }
 
@@ -3790,6 +3796,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
+	list_add_tail(&cgrp->allcg_node, &root->allcg_list);
+
 	err = cgroup_populate_dir(cgrp);
 	/* If err < 0, we have a half-filled directory - oh well ;) */
 
@@ -3998,6 +4006,8 @@ again:
 	list_del_init(&cgrp->sibling);
 	cgroup_unlock_hierarchy(cgrp->root);
 
+	list_del_init(&cgrp->allcg_node);
+
 	d = dget(cgrp->dentry);
 
 	cgroup_d_remove_dir(d);
-- 
cgit v1.2.3-59-g8ed1b


From 8e3f6541d45e1a002801e56a19530a90f775deba Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: implement cgroup_add_cftypes() and friends

Currently, cgroup directories are populated by subsys->populate()
callback explicitly creating files on each cgroup creation.  This
level of flexibility isn't needed or desirable.  It provides largely
unused flexibility which call for abuses while severely limiting what
the core layer can do through the lack of structure and conventions.

Per each cgroup file type, the only distinction that cgroup users is
making is whether a cgroup is root or not, which can easily be
expressed with flags.

This patch introduces cgroup_add_cftypes().  These deal with cftypes
instead of individual files - controllers indicate that certain types
of files exist for certain subsystem.  Newly added CFTYPE_*_ON_ROOT
flags indicate whether a cftype should be excluded or created only on
the root cgroup.

cgroup_add_cftypes() can be called any time whether the target
subsystem is currently attached or not.  cgroup core will create files
on the existing cgroups as necessary.

Also, cgroup_subsys->base_cftypes is added to ease registration of the
base files for the subsystem.  If non-NULL on subsys init, the cftypes
pointed to by ->base_cftypes are automatically registered on subsys
init / load.

Further patches will convert the existing users and remove the file
based interface.  Note that this interface allows dynamic addition of
files to an active controller.  This will be used for sub-controller
modularity and unified hierarchy in the longer term.

This patch implements the new mechanism but doesn't apply it to any
user.

v2: replaced DECLARE_CGROUP_CFTYPES[_COND]() with
    cgroup_subsys->base_cftypes, which works better for cgroup_subsys
    which is loaded as module.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  33 ++++++++++++-
 kernel/cgroup.c        | 132 ++++++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 162 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index ad2a14680b7f..af6211c7a42b 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -192,6 +192,7 @@ struct cgroup {
 	struct list_head css_sets;
 
 	struct list_head allcg_node;	/* cgroupfs_root->allcg_list */
+	struct list_head cft_q_node;	/* used during cftype add/rm */
 
 	/*
 	 * Linked list running through all cgroups that can
@@ -277,11 +278,17 @@ struct cgroup_map_cb {
  *	- the 'cftype' of the file is file->f_dentry->d_fsdata
  */
 
-#define MAX_CFTYPE_NAME 64
+/* cftype->flags */
+#define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
+#define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create onp root cg */
+
+#define MAX_CFTYPE_NAME		64
+
 struct cftype {
 	/*
 	 * By convention, the name should begin with the name of the
-	 * subsystem, followed by a period
+	 * subsystem, followed by a period.  Zero length string indicates
+	 * end of cftype array.
 	 */
 	char name[MAX_CFTYPE_NAME];
 	int private;
@@ -297,6 +304,9 @@ struct cftype {
 	 */
 	size_t max_write_len;
 
+	/* CFTYPE_* flags */
+	unsigned int flags;
+
 	int (*open)(struct inode *inode, struct file *file);
 	ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft,
 			struct file *file,
@@ -375,6 +385,16 @@ struct cftype {
 			struct eventfd_ctx *eventfd);
 };
 
+/*
+ * cftype_sets describe cftypes belonging to a subsystem and are chained at
+ * cgroup_subsys->cftsets.  Each cftset points to an array of cftypes
+ * terminated by zero length name.
+ */
+struct cftype_set {
+	struct list_head		node;	/* chained at subsys->cftsets */
+	const struct cftype		*cfts;
+};
+
 struct cgroup_scanner {
 	struct cgroup *cg;
 	int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan);
@@ -400,6 +420,8 @@ int cgroup_add_files(struct cgroup *cgrp,
 			const struct cftype cft[],
 			int count);
 
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts);
+
 int cgroup_is_removed(const struct cgroup *cgrp);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
@@ -502,6 +524,13 @@ struct cgroup_subsys {
 	struct idr idr;
 	spinlock_t id_lock;
 
+	/* list of cftype_sets */
+	struct list_head cftsets;
+
+	/* base cftypes, automatically [de]registered with subsys itself */
+	struct cftype *base_cftypes;
+	struct cftype_set base_cftset;
+
 	/* should be defined only by modular subsystems */
 	struct module *module;
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index db4319c030d0..df8fb82ef350 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2623,8 +2623,14 @@ int cgroup_add_file(struct cgroup *cgrp,
 	struct dentry *dentry;
 	int error;
 	umode_t mode;
-
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
+
+	/* does @cft->flags tell us to skip creation on @cgrp? */
+	if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent)
+		return 0;
+	if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent)
+		return 0;
+
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
 		strcpy(name, subsys->name);
 		strcat(name, ".");
@@ -2660,6 +2666,95 @@ int cgroup_add_files(struct cgroup *cgrp,
 }
 EXPORT_SYMBOL_GPL(cgroup_add_files);
 
+static DEFINE_MUTEX(cgroup_cft_mutex);
+
+static void cgroup_cfts_prepare(void)
+	__acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex)
+{
+	/*
+	 * Thanks to the entanglement with vfs inode locking, we can't walk
+	 * the existing cgroups under cgroup_mutex and create files.
+	 * Instead, we increment reference on all cgroups and build list of
+	 * them using @cgrp->cft_q_node.  Grab cgroup_cft_mutex to ensure
+	 * exclusive access to the field.
+	 */
+	mutex_lock(&cgroup_cft_mutex);
+	mutex_lock(&cgroup_mutex);
+}
+
+static void cgroup_cfts_commit(struct cgroup_subsys *ss,
+			       const struct cftype *cfts)
+	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
+{
+	LIST_HEAD(pending);
+	struct cgroup *cgrp, *n;
+	int count = 0;
+
+	while (cfts[count].name[0] != '\0')
+		count++;
+
+	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
+	if (cfts && ss->root != &rootnode) {
+		list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) {
+			dget(cgrp->dentry);
+			list_add_tail(&cgrp->cft_q_node, &pending);
+		}
+	}
+
+	mutex_unlock(&cgroup_mutex);
+
+	/*
+	 * All new cgroups will see @cfts update on @ss->cftsets.  Add/rm
+	 * files for all cgroups which were created before.
+	 */
+	list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) {
+		struct inode *inode = cgrp->dentry->d_inode;
+
+		mutex_lock(&inode->i_mutex);
+		mutex_lock(&cgroup_mutex);
+		if (!cgroup_is_removed(cgrp))
+			cgroup_add_files(cgrp, ss, cfts, count);
+		mutex_unlock(&cgroup_mutex);
+		mutex_unlock(&inode->i_mutex);
+
+		list_del_init(&cgrp->cft_q_node);
+		dput(cgrp->dentry);
+	}
+
+	mutex_unlock(&cgroup_cft_mutex);
+}
+
+/**
+ * cgroup_add_cftypes - add an array of cftypes to a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Register @cfts to @ss.  Files described by @cfts are created for all
+ * existing cgroups to which @ss is attached and all future cgroups will
+ * have them too.  This function can be called anytime whether @ss is
+ * attached or not.
+ *
+ * Returns 0 on successful registration, -errno on failure.  Note that this
+ * function currently returns 0 as long as @cfts registration is successful
+ * even if some file creation attempts on existing cgroups fail.
+ */
+int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+	struct cftype_set *set;
+
+	set = kzalloc(sizeof(*set), GFP_KERNEL);
+	if (!set)
+		return -ENOMEM;
+
+	cgroup_cfts_prepare();
+	set->cfts = cfts;
+	list_add_tail(&set->node, &ss->cftsets);
+	cgroup_cfts_commit(ss, cfts);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
+
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
@@ -3660,10 +3755,25 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 			return err;
 	}
 
+	/* process cftsets of each subsystem */
 	for_each_subsys(cgrp->root, ss) {
+		struct cftype_set *set;
+
 		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 			return err;
+
+		list_for_each_entry(set, &ss->cftsets, node) {
+			const struct cftype *cft;
+
+			for (cft = set->cfts; cft->name[0] != '\0'; cft++) {
+				err = cgroup_add_file(cgrp, ss, cft);
+				if (err)
+					pr_warning("cgroup_populate_dir: failed to create %s, err=%d\n",
+						   cft->name, err);
+			}
+		}
 	}
+
 	/* This cgroup is ready now */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
@@ -4034,12 +4144,29 @@ again:
 	return 0;
 }
 
+static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss)
+{
+	INIT_LIST_HEAD(&ss->cftsets);
+
+	/*
+	 * base_cftset is embedded in subsys itself, no need to worry about
+	 * deregistration.
+	 */
+	if (ss->base_cftypes) {
+		ss->base_cftset.cfts = ss->base_cftypes;
+		list_add_tail(&ss->base_cftset.node, &ss->cftsets);
+	}
+}
+
 static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 {
 	struct cgroup_subsys_state *css;
 
 	printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
+	/* init base cftset */
+	cgroup_init_cftsets(ss);
+
 	/* Create the top cgroup state for this subsystem */
 	list_add(&ss->sibling, &rootnode.subsys_list);
 	ss->root = &rootnode;
@@ -4109,6 +4236,9 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss)
 		return 0;
 	}
 
+	/* init base cftset */
+	cgroup_init_cftsets(ss);
+
 	/*
 	 * need to register a subsys id before anything else - for example,
 	 * init_cgroup_css needs it.
-- 
cgit v1.2.3-59-g8ed1b


From 6e6ff25bd548a0c5bf5163e4f63e2793dcefbdcb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: merge cft_release_agent cftype array into the base files
 array

Now that cftype can express whether a file should only be on root,
cft_release_agent can be merged into the base files cftypes array.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cgroup.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index df8fb82ef350..66c1e3b5c0f2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3732,13 +3732,13 @@ static struct cftype files[] = {
 		.read_u64 = cgroup_clone_children_read,
 		.write_u64 = cgroup_clone_children_write,
 	},
-};
-
-static struct cftype cft_release_agent = {
-	.name = "release_agent",
-	.read_seq_string = cgroup_release_agent_show,
-	.write_string = cgroup_release_agent_write,
-	.max_write_len = PATH_MAX,
+	{
+		.name = "release_agent",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_seq_string = cgroup_release_agent_show,
+		.write_string = cgroup_release_agent_write,
+		.max_write_len = PATH_MAX,
+	},
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3750,11 +3750,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	if (err < 0)
 		return err;
 
-	if (cgrp == cgrp->top_cgroup) {
-		if ((err = cgroup_add_file(cgrp, NULL, &cft_release_agent)) < 0)
-			return err;
-	}
-
 	/* process cftsets of each subsystem */
 	for_each_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
-- 
cgit v1.2.3-59-g8ed1b


From 4baf6e33251b37f111e21289f8ee71fe4cce236e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: convert all non-memcg controllers to the new cftype interface

Convert debug, freezer, cpuset, cpu_cgroup, cpuacct, net_prio, blkio,
net_cls and device controllers to use the new cftype based interface.
Termination entry is added to cftype arrays and populate callbacks are
replaced with cgroup_subsys->base_cftypes initializations.

This is functionally identical transformation.  There shouldn't be any
visible behavior change.

memcg is rather special and will be converted separately.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <paul@paulmenage.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Vivek Goyal <vgoyal@redhat.com>
---
 block/blk-cgroup.c        |  9 ++-------
 kernel/cgroup.c           | 10 +++-------
 kernel/cgroup_freezer.c   | 11 +++--------
 kernel/cpuset.c           | 31 ++++++++++---------------------
 kernel/sched/core.c       | 16 ++++------------
 net/core/netprio_cgroup.c |  8 ++------
 net/sched/cls_cgroup.c    |  8 ++------
 security/device_cgroup.c  | 10 ++--------
 8 files changed, 28 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 60ef16d6d155..126c341955de 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1515,14 +1515,9 @@ struct cftype blkio_files[] = {
 		.read_map = blkiocg_file_read_map,
 	},
 #endif
+	{ }	/* terminate */
 };
 
-static int blkiocg_populate(struct cgroup_subsys *subsys, struct cgroup *cgroup)
-{
-	return cgroup_add_files(cgroup, subsys, blkio_files,
-				ARRAY_SIZE(blkio_files));
-}
-
 static void blkiocg_destroy(struct cgroup *cgroup)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
@@ -1642,11 +1637,11 @@ struct cgroup_subsys blkio_subsys = {
 	.can_attach = blkiocg_can_attach,
 	.attach = blkiocg_attach,
 	.destroy = blkiocg_destroy,
-	.populate = blkiocg_populate,
 #ifdef CONFIG_BLK_CGROUP
 	/* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */
 	.subsys_id = blkio_subsys_id,
 #endif
+	.base_cftypes = blkio_files,
 	.use_id = 1,
 	.module = THIS_MODULE,
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 66c1e3b5c0f2..835ae29e4ea2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5349,19 +5349,15 @@ static struct cftype debug_files[] =  {
 		.name = "releasable",
 		.read_u64 = releasable_read,
 	},
-};
 
-static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	return cgroup_add_files(cont, ss, debug_files,
-				ARRAY_SIZE(debug_files));
-}
+	{ }	/* terminate */
+};
 
 struct cgroup_subsys debug_subsys = {
 	.name = "debug",
 	.create = debug_create,
 	.destroy = debug_destroy,
-	.populate = debug_populate,
 	.subsys_id = debug_subsys_id,
+	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..3649fc6b3eaa 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -358,24 +358,19 @@ static int freezer_write(struct cgroup *cgroup,
 static struct cftype files[] = {
 	{
 		.name = "state",
+		.flags = CFTYPE_NOT_ON_ROOT,
 		.read_seq_string = freezer_read,
 		.write_string = freezer_write,
 	},
+	{ }	/* terminate */
 };
 
-static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
-{
-	if (!cgroup->parent)
-		return 0;
-	return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
-}
-
 struct cgroup_subsys freezer_subsys = {
 	.name		= "freezer",
 	.create		= freezer_create,
 	.destroy	= freezer_destroy,
-	.populate	= freezer_populate,
 	.subsys_id	= freezer_subsys_id,
 	.can_attach	= freezer_can_attach,
 	.fork		= freezer_fork,
+	.base_cftypes	= files,
 };
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b96ad75b7e64..2382683617a3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1765,28 +1765,17 @@ static struct cftype files[] = {
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_SPREAD_SLAB,
 	},
-};
-
-static struct cftype cft_memory_pressure_enabled = {
-	.name = "memory_pressure_enabled",
-	.read_u64 = cpuset_read_u64,
-	.write_u64 = cpuset_write_u64,
-	.private = FILE_MEMORY_PRESSURE_ENABLED,
-};
 
-static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	int err;
+	{
+		.name = "memory_pressure_enabled",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_u64 = cpuset_read_u64,
+		.write_u64 = cpuset_write_u64,
+		.private = FILE_MEMORY_PRESSURE_ENABLED,
+	},
 
-	err = cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
-	if (err)
-		return err;
-	/* memory_pressure_enabled is in root cpuset only */
-	if (!cont->parent)
-		err = cgroup_add_file(cont, ss,
-				      &cft_memory_pressure_enabled);
-	return err;
-}
+	{ }	/* terminate */
+};
 
 /*
  * post_clone() is called during cgroup_create() when the
@@ -1887,9 +1876,9 @@ struct cgroup_subsys cpuset_subsys = {
 	.destroy = cpuset_destroy,
 	.can_attach = cpuset_can_attach,
 	.attach = cpuset_attach,
-	.populate = cpuset_populate,
 	.post_clone = cpuset_post_clone,
 	.subsys_id = cpuset_subsys_id,
+	.base_cftypes = files,
 	.early_init = 1,
 };
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d8f30a..afc6d7e71557 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7970,13 +7970,9 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
+	{ }	/* terminate */
 };
 
-static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
-{
-	return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
-}
-
 struct cgroup_subsys cpu_cgroup_subsys = {
 	.name		= "cpu",
 	.create		= cpu_cgroup_create,
@@ -7984,8 +7980,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
-	.populate	= cpu_cgroup_populate,
 	.subsys_id	= cpu_cgroup_subsys_id,
+	.base_cftypes	= cpu_files,
 	.early_init	= 1,
 };
 
@@ -8170,13 +8166,9 @@ static struct cftype files[] = {
 		.name = "stat",
 		.read_map = cpuacct_stats_show,
 	},
+	{ }	/* terminate */
 };
 
-static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
-}
-
 /*
  * charge this task's execution time to its accounting group.
  *
@@ -8208,7 +8200,7 @@ struct cgroup_subsys cpuacct_subsys = {
 	.name = "cpuacct",
 	.create = cpuacct_create,
 	.destroy = cpuacct_destroy,
-	.populate = cpuacct_populate,
 	.subsys_id = cpuacct_subsys_id,
+	.base_cftypes = files,
 };
 #endif	/* CONFIG_CGROUP_CPUACCT */
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index dbfd4e7f8dc4..b2a59866699f 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -242,21 +242,17 @@ static struct cftype ss_files[] = {
 		.read_map = read_priomap,
 		.write_string = write_priomap,
 	},
+	{ }	/* terminate */
 };
 
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
-}
-
 struct cgroup_subsys net_prio_subsys = {
 	.name		= "net_prio",
 	.create		= cgrp_create,
 	.destroy	= cgrp_destroy,
-	.populate	= cgrp_populate,
 #ifdef CONFIG_NETPRIO_CGROUP
 	.subsys_id	= net_prio_subsys_id,
 #endif
+	.base_cftypes	= ss_files,
 	.module		= THIS_MODULE
 };
 
diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c
index 024df8a32275..7743ea8d1d38 100644
--- a/net/sched/cls_cgroup.c
+++ b/net/sched/cls_cgroup.c
@@ -70,21 +70,17 @@ static struct cftype ss_files[] = {
 		.read_u64 = read_classid,
 		.write_u64 = write_classid,
 	},
+	{ }	/* terminate */
 };
 
-static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
-{
-	return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
-}
-
 struct cgroup_subsys net_cls_subsys = {
 	.name		= "net_cls",
 	.create		= cgrp_create,
 	.destroy	= cgrp_destroy,
-	.populate	= cgrp_populate,
 #ifdef CONFIG_NET_CLS_CGROUP
 	.subsys_id	= net_cls_subsys_id,
 #endif
+	.base_cftypes	= ss_files,
 	.module		= THIS_MODULE,
 };
 
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index c43a3323feea..442204cc22d9 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -447,22 +447,16 @@ static struct cftype dev_cgroup_files[] = {
 		.read_seq_string = devcgroup_seq_read,
 		.private = DEVCG_LIST,
 	},
+	{ }	/* terminate */
 };
 
-static int devcgroup_populate(struct cgroup_subsys *ss,
-				struct cgroup *cgroup)
-{
-	return cgroup_add_files(cgroup, ss, dev_cgroup_files,
-					ARRAY_SIZE(dev_cgroup_files));
-}
-
 struct cgroup_subsys devices_subsys = {
 	.name = "devices",
 	.can_attach = devcgroup_can_attach,
 	.create = devcgroup_create,
 	.destroy = devcgroup_destroy,
-	.populate = devcgroup_populate,
 	.subsys_id = devices_subsys_id,
+	.base_cftypes = dev_cgroup_files,
 };
 
 int __devcgroup_inode_permission(struct inode *inode, int mask)
-- 
cgit v1.2.3-59-g8ed1b


From db0416b64977cb0f382175608286cc80c7573e38 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: remove cgroup_add_file[s]()

No controller is using cgroup_add_files[s]().  Unexport them, and
convert cgroup_add_files() to handle NULL entry terminated array
instead of taking count explicitly and continue creation on failure
for internal use.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h | 16 ----------------
 kernel/cgroup.c        | 51 ++++++++++++++++++++------------------------------
 2 files changed, 20 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index af6211c7a42b..7a3d96755114 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -404,22 +404,6 @@ struct cgroup_scanner {
 	void *data;
 };
 
-/*
- * Add a new file to the given cgroup directory. Should only be
- * called by subsystems from within a populate() method
- */
-int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-		       const struct cftype *cft);
-
-/*
- * Add a set of new files to the given cgroup directory. Should
- * only be called by subsystems from within a populate() method
- */
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count);
-
 int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts);
 
 int cgroup_is_removed(const struct cgroup *cgrp);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 835ae29e4ea2..fb71a930ec8d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2615,9 +2615,8 @@ static umode_t cgroup_file_mode(const struct cftype *cft)
 	return mode;
 }
 
-int cgroup_add_file(struct cgroup *cgrp,
-		       struct cgroup_subsys *subsys,
-		       const struct cftype *cft)
+static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			   const struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
@@ -2649,22 +2648,23 @@ int cgroup_add_file(struct cgroup *cgrp,
 		error = PTR_ERR(dentry);
 	return error;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_file);
 
-int cgroup_add_files(struct cgroup *cgrp,
-			struct cgroup_subsys *subsys,
-			const struct cftype cft[],
-			int count)
+static int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			    const struct cftype cfts[])
 {
-	int i, err;
-	for (i = 0; i < count; i++) {
-		err = cgroup_add_file(cgrp, subsys, &cft[i]);
-		if (err)
-			return err;
+	const struct cftype *cft;
+	int err, ret = 0;
+
+	for (cft = cfts; cft->name[0] != '\0'; cft++) {
+		err = cgroup_add_file(cgrp, subsys, cft);
+		if (err) {
+			pr_warning("cgroup_add_files: failed to create %s, err=%d\n",
+				   cft->name, err);
+			ret = err;
+		}
 	}
-	return 0;
+	return ret;
 }
-EXPORT_SYMBOL_GPL(cgroup_add_files);
 
 static DEFINE_MUTEX(cgroup_cft_mutex);
 
@@ -2688,10 +2688,6 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 {
 	LIST_HEAD(pending);
 	struct cgroup *cgrp, *n;
-	int count = 0;
-
-	while (cfts[count].name[0] != '\0')
-		count++;
 
 	/* %NULL @cfts indicates abort and don't bother if @ss isn't attached */
 	if (cfts && ss->root != &rootnode) {
@@ -2713,7 +2709,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (!cgroup_is_removed(cgrp))
-			cgroup_add_files(cgrp, ss, cfts, count);
+			cgroup_add_files(cgrp, ss, cfts);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 
@@ -3739,6 +3735,7 @@ static struct cftype files[] = {
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
 	},
+	{ }	/* terminate */
 };
 
 static int cgroup_populate_dir(struct cgroup *cgrp)
@@ -3746,7 +3743,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	int err;
 	struct cgroup_subsys *ss;
 
-	err = cgroup_add_files(cgrp, NULL, files, ARRAY_SIZE(files));
+	err = cgroup_add_files(cgrp, NULL, files);
 	if (err < 0)
 		return err;
 
@@ -3757,16 +3754,8 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 			return err;
 
-		list_for_each_entry(set, &ss->cftsets, node) {
-			const struct cftype *cft;
-
-			for (cft = set->cfts; cft->name[0] != '\0'; cft++) {
-				err = cgroup_add_file(cgrp, ss, cft);
-				if (err)
-					pr_warning("cgroup_populate_dir: failed to create %s, err=%d\n",
-						   cft->name, err);
-			}
-		}
+		list_for_each_entry(set, &ss->cftsets, node)
+			cgroup_add_files(cgrp, ss, set->cfts);
 	}
 
 	/* This cgroup is ready now */
-- 
cgit v1.2.3-59-g8ed1b


From f6ea93723d0049ae5fbbb5292cb10c51d7a80801 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:55 -0700
Subject: cgroup: relocate __d_cgrp() and __d_cft()

Move the two macros upwards as they'll be used earlier in the file.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/cgroup.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fb71a930ec8d..2ab29eb381b7 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -282,6 +282,16 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_active_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+static inline struct cgroup *__d_cgrp(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -1704,16 +1714,6 @@ static struct file_system_type cgroup_fs_type = {
 
 static struct kobject *cgroup_kobj;
 
-static inline struct cgroup *__d_cgrp(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-static inline struct cftype *__d_cft(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
 /**
  * cgroup_path - generate the path of a cgroup
  * @cgrp: the cgroup in question
-- 
cgit v1.2.3-59-g8ed1b


From 05ef1d7c4a5b6d09cadd2b8e9b3c395363d1a89c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:56 -0700
Subject: cgroup: introduce struct cfent

This patch adds cfent (cgroup file entry) which is the association
between a cgroup and a file.  This is in-cgroup representation of
files under a cgroup directory.  This simplifies walking walking
cgroup files and thus cgroup_clear_directory(), which is now
implemented in two parts - cgroup_rm_file() and a loop around it.

cgroup_rm_file() will be used to implement cftype removal and cfent is
scheduled to serve cgroup specific per-file data (e.g. for sysfs-like
"sever" semantics).

v2: - cfe was freed from cgroup_rm_file() which led to use-after-free
      if the file had openers at the time of removal.  Moved to
      cgroup_diput().

    - cgroup_clear_directory() triggered WARN_ON_ONCE() if d_subdirs
      wasn't empty after removing all files.  This triggered
      spuriously if some files were open during directory clearing.
      Removed.

v3: - In cgroup_diput(), WARN_ONCE(!list_empty(&cfe->node)) could be
      spuriously triggered for root cgroups because they don't go
      through cgroup_clear_directory() on unmount.  Don't trigger WARN
      for root cgroups.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Glauber Costa <glommer@parallels.com>
---
 include/linux/cgroup.h |   1 +
 kernel/cgroup.c        | 113 +++++++++++++++++++++++++++++++++----------------
 2 files changed, 78 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7a3d96755114..87b034ed0c31 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -175,6 +175,7 @@ struct cgroup {
 	 */
 	struct list_head sibling;	/* my parent's children */
 	struct list_head children;	/* my children */
+	struct list_head files;		/* my files */
 
 	struct cgroup *parent;		/* my parent */
 	struct dentry __rcu *dentry;	/* cgroup fs entry, RCU protected */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2ab29eb381b7..7d5d1c927d9d 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -147,6 +147,15 @@ struct cgroupfs_root {
  */
 static struct cgroupfs_root rootnode;
 
+/*
+ * cgroupfs file entry, pointed to from leaf dentry->d_fsdata.
+ */
+struct cfent {
+	struct list_head		node;
+	struct dentry			*dentry;
+	struct cftype			*type;
+};
+
 /*
  * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
  * cgroup_subsys->use_id != 0.
@@ -287,11 +296,16 @@ static inline struct cgroup *__d_cgrp(struct dentry *dentry)
 	return dentry->d_fsdata;
 }
 
-static inline struct cftype *__d_cft(struct dentry *dentry)
+static inline struct cfent *__d_cfe(struct dentry *dentry)
 {
 	return dentry->d_fsdata;
 }
 
+static inline struct cftype *__d_cft(struct dentry *dentry)
+{
+	return __d_cfe(dentry)->type;
+}
+
 /* the list of cgroups eligible for automatic release. Protected by
  * release_list_lock */
 static LIST_HEAD(release_list);
@@ -877,6 +891,14 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 		BUG_ON(!list_empty(&cgrp->pidlists));
 
 		kfree_rcu(cgrp, rcu_head);
+	} else {
+		struct cfent *cfe = __d_cfe(dentry);
+		struct cgroup *cgrp = dentry->d_parent->d_fsdata;
+
+		WARN_ONCE(!list_empty(&cfe->node) &&
+			  cgrp != &cgrp->root->top_cgroup,
+			  "cfe still linked for %s\n", cfe->type->name);
+		kfree(cfe);
 	}
 	iput(inode);
 }
@@ -895,34 +917,36 @@ static void remove_dir(struct dentry *d)
 	dput(parent);
 }
 
-static void cgroup_clear_directory(struct dentry *dentry)
-{
-	struct list_head *node;
-
-	BUG_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
-	spin_lock(&dentry->d_lock);
-	node = dentry->d_subdirs.next;
-	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
-
-		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-		list_del_init(node);
-		if (d->d_inode) {
-			/* This should never be called on a cgroup
-			 * directory with child cgroups */
-			BUG_ON(d->d_inode->i_mode & S_IFDIR);
-			dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			spin_unlock(&dentry->d_lock);
-			d_delete(d);
-			simple_unlink(dentry->d_inode, d);
-			dput(d);
-			spin_lock(&dentry->d_lock);
-		} else
-			spin_unlock(&d->d_lock);
-		node = dentry->d_subdirs.next;
+static int cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft)
+{
+	struct cfent *cfe;
+
+	lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex);
+	lockdep_assert_held(&cgroup_mutex);
+
+	list_for_each_entry(cfe, &cgrp->files, node) {
+		struct dentry *d = cfe->dentry;
+
+		if (cft && cfe->type != cft)
+			continue;
+
+		dget(d);
+		d_delete(d);
+		simple_unlink(d->d_inode, d);
+		list_del_init(&cfe->node);
+		dput(d);
+
+		return 0;
 	}
-	spin_unlock(&dentry->d_lock);
+	return -ENOENT;
+}
+
+static void cgroup_clear_directory(struct dentry *dir)
+{
+	struct cgroup *cgrp = __d_cgrp(dir);
+
+	while (!list_empty(&cgrp->files))
+		cgroup_rm_file(cgrp, NULL);
 }
 
 /*
@@ -1352,6 +1376,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 {
 	INIT_LIST_HEAD(&cgrp->sibling);
 	INIT_LIST_HEAD(&cgrp->children);
+	INIT_LIST_HEAD(&cgrp->files);
 	INIT_LIST_HEAD(&cgrp->css_sets);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
@@ -2619,7 +2644,9 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			   const struct cftype *cft)
 {
 	struct dentry *dir = cgrp->dentry;
+	struct cgroup *parent = __d_cgrp(dir);
 	struct dentry *dentry;
+	struct cfent *cfe;
 	int error;
 	umode_t mode;
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
@@ -2635,17 +2662,31 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 		strcat(name, ".");
 	}
 	strcat(name, cft->name);
+
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
+
+	cfe = kzalloc(sizeof(*cfe), GFP_KERNEL);
+	if (!cfe)
+		return -ENOMEM;
+
 	dentry = lookup_one_len(name, dir, strlen(name));
-	if (!IS_ERR(dentry)) {
-		mode = cgroup_file_mode(cft);
-		error = cgroup_create_file(dentry, mode | S_IFREG,
-						cgrp->root->sb);
-		if (!error)
-			dentry->d_fsdata = (void *)cft;
-		dput(dentry);
-	} else
+	if (IS_ERR(dentry)) {
 		error = PTR_ERR(dentry);
+		goto out;
+	}
+
+	mode = cgroup_file_mode(cft);
+	error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb);
+	if (!error) {
+		cfe->type = (void *)cft;
+		cfe->dentry = dentry;
+		dentry->d_fsdata = cfe;
+		list_add_tail(&cfe->node, &parent->files);
+		cfe = NULL;
+	}
+	dput(dentry);
+out:
+	kfree(cfe);
 	return error;
 }
 
-- 
cgit v1.2.3-59-g8ed1b


From 79578621b4847afdef48d19a28d00e3b188c37e1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:56 -0700
Subject: cgroup: implement cgroup_rm_cftypes()

Implement cgroup_rm_cftypes() which removes an array of cftypes from a
subsystem.  It can be called whether the target subsys is attached or
not.  cgroup core will remove the specified file from all existing
cgroups.

This will be used to improve sub-subsys modularity and will be helpful
for unified hierarchy.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  1 +
 kernel/cgroup.c        | 54 ++++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 45 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 87b034ed0c31..028478c6e0c5 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -406,6 +406,7 @@ struct cgroup_scanner {
 };
 
 int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts);
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts);
 
 int cgroup_is_removed(const struct cgroup *cgrp);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7d5d1c927d9d..21bba7722350 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2690,17 +2690,20 @@ out:
 	return error;
 }
 
-static int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
-			    const struct cftype cfts[])
+static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
+			      const struct cftype cfts[], bool is_add)
 {
 	const struct cftype *cft;
 	int err, ret = 0;
 
 	for (cft = cfts; cft->name[0] != '\0'; cft++) {
-		err = cgroup_add_file(cgrp, subsys, cft);
+		if (is_add)
+			err = cgroup_add_file(cgrp, subsys, cft);
+		else
+			err = cgroup_rm_file(cgrp, cft);
 		if (err) {
-			pr_warning("cgroup_add_files: failed to create %s, err=%d\n",
-				   cft->name, err);
+			pr_warning("cgroup_addrm_files: failed to %s %s, err=%d\n",
+				   is_add ? "add" : "remove", cft->name, err);
 			ret = err;
 		}
 	}
@@ -2724,7 +2727,7 @@ static void cgroup_cfts_prepare(void)
 }
 
 static void cgroup_cfts_commit(struct cgroup_subsys *ss,
-			       const struct cftype *cfts)
+			       const struct cftype *cfts, bool is_add)
 	__releases(&cgroup_mutex) __releases(&cgroup_cft_mutex)
 {
 	LIST_HEAD(pending);
@@ -2750,7 +2753,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss,
 		mutex_lock(&inode->i_mutex);
 		mutex_lock(&cgroup_mutex);
 		if (!cgroup_is_removed(cgrp))
-			cgroup_add_files(cgrp, ss, cfts);
+			cgroup_addrm_files(cgrp, ss, cfts, is_add);
 		mutex_unlock(&cgroup_mutex);
 		mutex_unlock(&inode->i_mutex);
 
@@ -2786,12 +2789,43 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
 	cgroup_cfts_prepare();
 	set->cfts = cfts;
 	list_add_tail(&set->node, &ss->cftsets);
-	cgroup_cfts_commit(ss, cfts);
+	cgroup_cfts_commit(ss, cfts, true);
 
 	return 0;
 }
 EXPORT_SYMBOL_GPL(cgroup_add_cftypes);
 
+/**
+ * cgroup_rm_cftypes - remove an array of cftypes from a subsystem
+ * @ss: target cgroup subsystem
+ * @cfts: zero-length name terminated array of cftypes
+ *
+ * Unregister @cfts from @ss.  Files described by @cfts are removed from
+ * all existing cgroups to which @ss is attached and all future cgroups
+ * won't have them either.  This function can be called anytime whether @ss
+ * is attached or not.
+ *
+ * Returns 0 on successful unregistration, -ENOENT if @cfts is not
+ * registered with @ss.
+ */
+int cgroup_rm_cftypes(struct cgroup_subsys *ss, const struct cftype *cfts)
+{
+	struct cftype_set *set;
+
+	cgroup_cfts_prepare();
+
+	list_for_each_entry(set, &ss->cftsets, node) {
+		if (set->cfts == cfts) {
+			list_del_init(&set->node);
+			cgroup_cfts_commit(ss, cfts, false);
+			return 0;
+		}
+	}
+
+	cgroup_cfts_commit(ss, NULL, false);
+	return -ENOENT;
+}
+
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
  * @cgrp: the cgroup in question
@@ -3784,7 +3818,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	int err;
 	struct cgroup_subsys *ss;
 
-	err = cgroup_add_files(cgrp, NULL, files);
+	err = cgroup_addrm_files(cgrp, NULL, files, true);
 	if (err < 0)
 		return err;
 
@@ -3796,7 +3830,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 			return err;
 
 		list_for_each_entry(set, &ss->cftsets, node)
-			cgroup_add_files(cgrp, ss, set->cfts);
+			cgroup_addrm_files(cgrp, ss, set->cfts, true);
 	}
 
 	/* This cgroup is ready now */
-- 
cgit v1.2.3-59-g8ed1b


From 28b4c27b8e6bb6d7ff2875281a8484f8898a87ef Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:56 -0700
Subject: cgroup: use negative bias on css->refcnt to block css_tryget()

When a cgroup is about to be removed, cgroup_clear_css_refs() is
called to check and ensure that there are no active css references.

This is currently achieved by dropping the refcnt to zero iff it has
only the base ref.  If all css refs could be dropped to zero, ref
clearing is successful and CSS_REMOVED is set on all css.  If not, the
base ref is restored.  While css ref is zero w/o CSS_REMOVED set, any
css_tryget() attempt on it busy loops so that they are atomic
w.r.t. the whole css ref clearing.

This does work but dropping and re-instating the base ref is somewhat
hairy and makes it difficult to add more logic to the put path as
there are two of them - the regular css_put() and the reversible base
ref clearing.

This patch updates css ref clearing such that blocking new
css_tryget() and putting the base ref are separate operations.
CSS_DEACT_BIAS, defined as INT_MIN, is added to css->refcnt and
css_tryget() busy loops while refcnt is negative.  After all css refs
are deactivated, if they were all one, ref clearing succeeded and
CSS_REMOVED is set and the base ref is put using the regular
css_put(); otherwise, CSS_DEACT_BIAS is subtracted from the refcnts
and the original postive values are restored.

css_refcnt() accessor which always returns the unbiased positive
reference counts is added and used to simplify refcnt usages.  While
at it, relocate and reformat comments in cgroup_has_css_refs().

This separates css->refcnt deactivation and putting the base ref,
which enables the next patch to make ref clearing optional.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
---
 include/linux/cgroup.h |  12 ++---
 kernel/cgroup.c        | 119 +++++++++++++++++++++++++++++--------------------
 2 files changed, 75 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 028478c6e0c5..be81fafae11f 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -115,16 +115,12 @@ static inline bool css_is_removed(struct cgroup_subsys_state *css)
  * the css has been destroyed.
  */
 
+extern bool __css_tryget(struct cgroup_subsys_state *css);
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (test_bit(CSS_ROOT, &css->flags))
 		return true;
-	while (!atomic_inc_not_zero(&css->refcnt)) {
-		if (test_bit(CSS_REMOVED, &css->flags))
-			return false;
-		cpu_relax();
-	}
-	return true;
+	return __css_tryget(css);
 }
 
 /*
@@ -132,11 +128,11 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
  * css_get() or css_tryget()
  */
 
-extern void __css_put(struct cgroup_subsys_state *css, int count);
+extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!test_bit(CSS_ROOT, &css->flags))
-		__css_put(css, 1);
+		__css_put(css);
 }
 
 /* bits in struct cgroup flags field */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 21bba7722350..2eade5186604 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -63,6 +63,9 @@
 
 #include <linux/atomic.h>
 
+/* css deactivation bias, makes css->refcnt negative to deny new trygets */
+#define CSS_DEACT_BIAS		INT_MIN
+
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -251,6 +254,14 @@ int cgroup_lock_is_held(void)
 
 EXPORT_SYMBOL_GPL(cgroup_lock_is_held);
 
+/* the current nr of refs, always >= 0 whether @css is deactivated or not */
+static int css_refcnt(struct cgroup_subsys_state *css)
+{
+	int v = atomic_read(&css->refcnt);
+
+	return v >= 0 ? v : v - CSS_DEACT_BIAS;
+}
+
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -4006,18 +4017,19 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+/*
+ * Check the reference count on each subsystem. Since we already
+ * established that there are no tasks in the cgroup, if the css refcount
+ * is also 1, then there should be no outstanding references, so the
+ * subsystem is safe to destroy. We scan across all subsystems rather than
+ * using the per-hierarchy linked list of mounted subsystems since we can
+ * be called via check_for_release() with no synchronization other than
+ * RCU, and the subsystem linked list isn't RCU-safe.
+ */
 static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
-	/* Check the reference count on each subsystem. Since we
-	 * already established that there are no tasks in the
-	 * cgroup, if the css refcount is also 1, then there should
-	 * be no outstanding references, so the subsystem is safe to
-	 * destroy. We scan across all subsystems rather than using
-	 * the per-hierarchy linked list of mounted subsystems since
-	 * we can be called via check_for_release() with no
-	 * synchronization other than RCU, and the subsystem linked
-	 * list isn't RCU-safe */
 	int i;
+
 	/*
 	 * We won't need to lock the subsys array, because the subsystems
 	 * we're concerned about aren't going anywhere since our cgroup root
@@ -4026,17 +4038,21 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		struct cgroup_subsys_state *css;
+
 		/* Skip subsystems not present or not in this hierarchy */
 		if (ss == NULL || ss->root != cgrp->root)
 			continue;
+
 		css = cgrp->subsys[ss->subsys_id];
-		/* When called from check_for_release() it's possible
+		/*
+		 * When called from check_for_release() it's possible
 		 * that by this point the cgroup has been removed
 		 * and the css deleted. But a false-positive doesn't
 		 * matter, since it can only happen if the cgroup
 		 * has been deleted and hence no longer needs the
-		 * release agent to be called anyway. */
-		if (css && (atomic_read(&css->refcnt) > 1))
+		 * release agent to be called anyway.
+		 */
+		if (css && css_refcnt(css) > 1)
 			return 1;
 	}
 	return 0;
@@ -4053,44 +4069,37 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
 	struct cgroup_subsys *ss;
 	unsigned long flags;
 	bool failed = false;
+
 	local_irq_save(flags);
+
+	/*
+	 * Block new css_tryget() by deactivating refcnt.  If all refcnts
+	 * were 1 at the moment of deactivation, we succeeded.
+	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-		int refcnt;
-		while (1) {
-			/* We can only remove a CSS with a refcnt==1 */
-			refcnt = atomic_read(&css->refcnt);
-			if (refcnt > 1) {
-				failed = true;
-				goto done;
-			}
-			BUG_ON(!refcnt);
-			/*
-			 * Drop the refcnt to 0 while we check other
-			 * subsystems. This will cause any racing
-			 * css_tryget() to spin until we set the
-			 * CSS_REMOVED bits or abort
-			 */
-			if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
-				break;
-			cpu_relax();
-		}
+
+		WARN_ON(atomic_read(&css->refcnt) < 0);
+		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
+		failed |= css_refcnt(css) != 1;
 	}
- done:
+
+	/*
+	 * If succeeded, set REMOVED and put all the base refs; otherwise,
+	 * restore refcnts to positive values.  Either way, all in-progress
+	 * css_tryget() will be released.
+	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
-		if (failed) {
-			/*
-			 * Restore old refcnt if we previously managed
-			 * to clear it from 1 to 0
-			 */
-			if (!atomic_read(&css->refcnt))
-				atomic_set(&css->refcnt, 1);
-		} else {
-			/* Commit the fact that the CSS is removed */
+
+		if (!failed) {
 			set_bit(CSS_REMOVED, &css->flags);
+			css_put(css);
+		} else {
+			atomic_sub(CSS_DEACT_BIAS, &css->refcnt);
 		}
 	}
+
 	local_irq_restore(flags);
 	return !failed;
 }
@@ -4887,13 +4896,28 @@ static void check_for_release(struct cgroup *cgrp)
 }
 
 /* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css, int count)
+bool __css_tryget(struct cgroup_subsys_state *css)
+{
+	do {
+		int v = css_refcnt(css);
+
+		if (atomic_cmpxchg(&css->refcnt, v, v + 1) == v)
+			return true;
+		cpu_relax();
+	} while (!test_bit(CSS_REMOVED, &css->flags));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(__css_tryget);
+
+/* Caller must verify that the css is not for root cgroup */
+void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
-	int val;
+
 	rcu_read_lock();
-	val = atomic_sub_return(count, &css->refcnt);
-	if (val == 1) {
+	atomic_dec(&css->refcnt);
+	if (css_refcnt(css) == 1) {
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
@@ -4901,7 +4925,6 @@ void __css_put(struct cgroup_subsys_state *css, int count)
 		cgroup_wakeup_rmdir_waiter(cgrp);
 	}
 	rcu_read_unlock();
-	WARN_ON_ONCE(val < 1);
 }
 EXPORT_SYMBOL_GPL(__css_put);
 
@@ -5020,7 +5043,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+	cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
 	if (cssid)
 		return cssid->id;
@@ -5032,7 +5055,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 {
 	struct css_id *cssid;
 
-	cssid = rcu_dereference_check(css->id, atomic_read(&css->refcnt));
+	cssid = rcu_dereference_check(css->id, css_refcnt(css));
 
 	if (cssid)
 		return cssid->depth;
-- 
cgit v1.2.3-59-g8ed1b


From 48ddbe194623ae089cc0576e60363f2d2e85662a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 1 Apr 2012 12:09:56 -0700
Subject: cgroup: make css->refcnt clearing on cgroup removal optional

Currently, cgroup removal tries to drain all css references.  If there
are active css references, the removal logic waits and retries
->pre_detroy() until either all refs drop to zero or removal is
cancelled.

This semantics is unusual and adds non-trivial complexity to cgroup
core and IMHO is fundamentally misguided in that it couples internal
implementation details (references to internal data structure) with
externally visible operation (rmdir).  To userland, this is a behavior
peculiarity which is unnecessary and difficult to expect (css refs is
otherwise invisible from userland), and, to policy implementations,
this is an unnecessary restriction (e.g. blkcg wants to hold css refs
for caching purposes but can't as that becomes visible as rmdir hang).

Unfortunately, memcg currently depends on ->pre_destroy() retrials and
cgroup removal vetoing and can't be immmediately switched to the new
behavior.  This patch introduces the new behavior of not waiting for
css refs to drain and maintains the old behavior for subsystems which
have __DEPRECATED_clear_css_refs set.

Once, memcg is updated, we can drop the code paths for the old
behavior as proposed in the following patch.  Note that the following
patch is incorrect in that dput work item is in cgroup and may lose
some of dputs when multiples css's are released back-to-back, and
__css_put() triggers check_for_release() when refcnt reaches 0 instead
of 1; however, it shows what part can be removed.

  http://thread.gmane.org/gmane.linux.kernel.containers/22559/focus=75251

Note that, in not-too-distant future, cgroup core will start emitting
warning messages for subsys which require the old behavior, so please
get moving.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Balbir Singh <bsingharora@gmail.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
---
 include/linux/cgroup.h | 17 ++++++++++++
 kernel/cgroup.c        | 71 +++++++++++++++++++++++++++++++++++++++++++-------
 mm/memcontrol.c        |  1 +
 3 files changed, 80 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index be81fafae11f..565c8034e6c8 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -16,6 +16,7 @@
 #include <linux/prio_heap.h>
 #include <linux/rwsem.h>
 #include <linux/idr.h>
+#include <linux/workqueue.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -76,12 +77,16 @@ struct cgroup_subsys_state {
 	unsigned long flags;
 	/* ID for this css, if possible */
 	struct css_id __rcu *id;
+
+	/* Used to put @cgroup->dentry on the last css_put() */
+	struct work_struct dput_work;
 };
 
 /* bits in struct cgroup_subsys_state flags field */
 enum {
 	CSS_ROOT, /* This CSS is the root of the subsystem */
 	CSS_REMOVED, /* This CSS is dead */
+	CSS_CLEAR_CSS_REFS,		/* @ss->__DEPRECATED_clear_css_refs */
 };
 
 /* Caller must verify that the css is not for root cgroup */
@@ -480,6 +485,18 @@ struct cgroup_subsys {
 	 * (not available in early_init time.)
 	 */
 	bool use_id;
+
+	/*
+	 * If %true, cgroup removal will try to clear css refs by retrying
+	 * ss->pre_destroy() until there's no css ref left.  This behavior
+	 * is strictly for backward compatibility and will be removed as
+	 * soon as the current user (memcg) is updated.
+	 *
+	 * If %false, ss->pre_destroy() can't fail and cgroup removal won't
+	 * wait for css refs to drop to zero before proceeding.
+	 */
+	bool __DEPRECATED_clear_css_refs;
+
 #define MAX_CGROUP_TYPE_NAMELEN 32
 	const char *name;
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2eade5186604..2905977e0f33 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -854,12 +854,17 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 	struct cgroup_subsys *ss;
 	int ret = 0;
 
-	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy) {
-			ret = ss->pre_destroy(cgrp);
-			if (ret)
-				break;
+	for_each_subsys(cgrp->root, ss) {
+		if (!ss->pre_destroy)
+			continue;
+
+		ret = ss->pre_destroy(cgrp);
+		if (ret) {
+			/* ->pre_destroy() failure is being deprecated */
+			WARN_ON_ONCE(!ss->__DEPRECATED_clear_css_refs);
+			break;
 		}
+	}
 
 	return ret;
 }
@@ -3859,6 +3864,14 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	return 0;
 }
 
+static void css_dput_fn(struct work_struct *work)
+{
+	struct cgroup_subsys_state *css =
+		container_of(work, struct cgroup_subsys_state, dput_work);
+
+	dput(css->cgroup->dentry);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			       struct cgroup_subsys *ss,
 			       struct cgroup *cgrp)
@@ -3871,6 +3884,16 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
 	cgrp->subsys[ss->subsys_id] = css;
+
+	/*
+	 * If !clear_css_refs, css holds an extra ref to @cgrp->dentry
+	 * which is put on the last css_put().  dput() requires process
+	 * context, which css_put() may be called without.  @css->dput_work
+	 * will be used to invoke dput() asynchronously from css_put().
+	 */
+	INIT_WORK(&css->dput_work, css_dput_fn);
+	if (ss->__DEPRECATED_clear_css_refs)
+		set_bit(CSS_CLEAR_CSS_REFS, &css->flags);
 }
 
 static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
@@ -3973,6 +3996,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (err < 0)
 		goto err_remove;
 
+	/* If !clear_css_refs, each css holds a ref to the cgroup's dentry */
+	for_each_subsys(root, ss)
+		if (!ss->__DEPRECATED_clear_css_refs)
+			dget(dentry);
+
 	/* The cgroup directory was pre-locked for us */
 	BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
 
@@ -4062,8 +4090,24 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  * Atomically mark all (or else none) of the cgroup's CSS objects as
  * CSS_REMOVED. Return true on success, or false if the cgroup has
  * busy subsystems. Call with cgroup_mutex held
+ *
+ * Depending on whether a subsys has __DEPRECATED_clear_css_refs set or
+ * not, cgroup removal behaves differently.
+ *
+ * If clear is set, css refcnt for the subsystem should be zero before
+ * cgroup removal can be committed.  This is implemented by
+ * CGRP_WAIT_ON_RMDIR and retry logic around ->pre_destroy(), which may be
+ * called multiple times until all css refcnts reach zero and is allowed to
+ * veto removal on any invocation.  This behavior is deprecated and will be
+ * removed as soon as the existing user (memcg) is updated.
+ *
+ * If clear is not set, each css holds an extra reference to the cgroup's
+ * dentry and cgroup removal proceeds regardless of css refs.
+ * ->pre_destroy() will be called at least once and is not allowed to fail.
+ * On the last put of each css, whenever that may be, the extra dentry ref
+ * is put so that dentry destruction happens only after all css's are
+ * released.
  */
-
 static int cgroup_clear_css_refs(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
@@ -4074,14 +4118,17 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
 
 	/*
 	 * Block new css_tryget() by deactivating refcnt.  If all refcnts
-	 * were 1 at the moment of deactivation, we succeeded.
+	 * for subsystems w/ clear_css_refs set were 1 at the moment of
+	 * deactivation, we succeeded.
 	 */
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
 		WARN_ON(atomic_read(&css->refcnt) < 0);
 		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-		failed |= css_refcnt(css) != 1;
+
+		if (ss->__DEPRECATED_clear_css_refs)
+			failed |= css_refcnt(css) != 1;
 	}
 
 	/*
@@ -4917,12 +4964,18 @@ void __css_put(struct cgroup_subsys_state *css)
 
 	rcu_read_lock();
 	atomic_dec(&css->refcnt);
-	if (css_refcnt(css) == 1) {
+	switch (css_refcnt(css)) {
+	case 1:
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
 		}
 		cgroup_wakeup_rmdir_waiter(cgrp);
+		break;
+	case 0:
+		if (!test_bit(CSS_CLEAR_CSS_REFS, &css->flags))
+			schedule_work(&css->dput_work);
+		break;
 	}
 	rcu_read_unlock();
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index bef114258bbd..d28359cd6b55 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5635,6 +5635,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
 	.base_cftypes = mem_cgroup_files,
 	.early_init = 0,
 	.use_id = 1,
+	.__DEPRECATED_clear_css_refs = true,
 };
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
-- 
cgit v1.2.3-59-g8ed1b


From 86f82d561864e902c70282b6f17cf590c0f34691 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 10 Apr 2012 10:16:36 -0700
Subject: cgroup: remove cgroup_subsys->populate()

With memcg converted, cgroup_subsys->populate() doesn't have any user
left.  Remove it.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Li Zefan <lizefan@huawei.com>
---
 include/linux/cgroup.h | 1 -
 kernel/cgroup.c        | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 565c8034e6c8..d3f5fba2c159 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -472,7 +472,6 @@ struct cgroup_subsys {
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp,
 		     struct task_struct *task);
-	int (*populate)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	void (*post_clone)(struct cgroup *cgrp);
 	void (*bind)(struct cgroup *root);
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 2905977e0f33..b2f203f25ec8 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3842,9 +3842,6 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 	for_each_subsys(cgrp->root, ss) {
 		struct cftype_set *set;
 
-		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
-			return err;
-
 		list_for_each_entry(set, &ss->cftsets, node)
 			cgroup_addrm_files(cgrp, ss, set->cfts, true);
 	}
-- 
cgit v1.2.3-59-g8ed1b


From c4c27fbdda4e8ba87806c415b6d15266b07bce4b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <mgalbraith@suse.de>
Date: Sat, 21 Apr 2012 09:13:46 +0200
Subject: cgroups: disallow attaching kthreadd or PF_THREAD_BOUND threads

Allowing kthreadd to be moved to a non-root group makes no sense, it being
a global resource, and needlessly leads unsuspecting users toward trouble.

1. An RT workqueue worker thread spawned in a task group with no rt_runtime
allocated is not schedulable.  Simple user error, but harmful to the box.

2. A worker thread which acquires PF_THREAD_BOUND can never leave a cpuset,
rendering the cpuset immortal.

Save the user some unexpected trouble, just say no.

Signed-off-by: Mike Galbraith <mgalbraith@suse.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Li Zefan <lizefan@huawei.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/cgroup.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b2f203f25ec8..ad8eae5bb801 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
 #include <linux/eventfd.h>
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/kthread.h>
 
 #include <linux/atomic.h>
 
@@ -2225,6 +2226,18 @@ retry_find_task:
 
 	if (threadgroup)
 		tsk = tsk->group_leader;
+
+	/*
+	 * Workqueue threads may acquire PF_THREAD_BOUND and become
+	 * trapped in a cpuset, or RT worker may be born in a cgroup
+	 * with no rt_runtime allocated.  Just say no.
+	 */
+	if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+		ret = -EINVAL;
+		rcu_read_unlock();
+		goto out_unlock_cgroup;
+	}
+
 	get_task_struct(tsk);
 	rcu_read_unlock();
 
-- 
cgit v1.2.3-59-g8ed1b


From 4d8438f044d8aaac6fbba98316ba484dabea397d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Apr 2012 01:11:35 +0200
Subject: res_counter: Merge res_counter_charge and res_counter_charge_nofail

These two functions do almost the same thing and duplicate some code.
Merge their implementation into a single common function.
res_counter_charge_locked() takes one more parameter but it doesn't seem
to be used outside res_counter.c yet anyway.

There is no (intended) change in the behaviour.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Li Zefan <lizefan@huawei.com>
---
 Documentation/cgroups/resource_counter.txt |  4 +-
 include/linux/res_counter.h                |  2 +-
 kernel/res_counter.c                       | 73 ++++++++++++++----------------
 3 files changed, 37 insertions(+), 42 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroups/resource_counter.txt b/Documentation/cgroups/resource_counter.txt
index 95b24d766eab..f3c4ec3626a2 100644
--- a/Documentation/cgroups/resource_counter.txt
+++ b/Documentation/cgroups/resource_counter.txt
@@ -77,11 +77,11 @@ to work with it.
 	where the charging failed.
 
  d. int res_counter_charge_locked
-			(struct res_counter *rc, unsigned long val)
+			(struct res_counter *rc, unsigned long val, bool force)
 
 	The same as res_counter_charge(), but it must not acquire/release the
 	res_counter->lock internally (it must be called with res_counter->lock
-	held).
+	held). The force parameter indicates whether we can bypass the limit.
 
  e. void res_counter_uncharge[_locked]
 			(struct res_counter *rc, unsigned long val)
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index da81af086eaf..fb201896a8b0 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -116,7 +116,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
  */
 
 int __must_check res_counter_charge_locked(struct res_counter *counter,
-		unsigned long val);
+					   unsigned long val, bool force);
 int __must_check res_counter_charge(struct res_counter *counter,
 		unsigned long val, struct res_counter **limit_fail_at);
 int __must_check res_counter_charge_nofail(struct res_counter *counter,
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d508363858b3..07a29923aba2 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -22,75 +22,70 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent)
 	counter->parent = parent;
 }
 
-int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
+int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
+			      bool force)
 {
+	int ret = 0;
+
 	if (counter->usage + val > counter->limit) {
 		counter->failcnt++;
-		return -ENOMEM;
+		ret = -ENOMEM;
+		if (!force)
+			return ret;
 	}
 
 	counter->usage += val;
-	if (counter->usage > counter->max_usage)
+	if (!force && counter->usage > counter->max_usage)
 		counter->max_usage = counter->usage;
-	return 0;
+	return ret;
 }
 
-int res_counter_charge(struct res_counter *counter, unsigned long val,
-			struct res_counter **limit_fail_at)
+static int __res_counter_charge(struct res_counter *counter, unsigned long val,
+				struct res_counter **limit_fail_at, bool force)
 {
-	int ret;
+	int ret, r;
 	unsigned long flags;
 	struct res_counter *c, *u;
 
+	r = ret = 0;
 	*limit_fail_at = NULL;
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
-		ret = res_counter_charge_locked(c, val);
+		r = res_counter_charge_locked(c, val, force);
 		spin_unlock(&c->lock);
-		if (ret < 0) {
+		if (r < 0 && !ret) {
+			ret = r;
 			*limit_fail_at = c;
-			goto undo;
+			if (!force)
+				break;
 		}
 	}
-	ret = 0;
-	goto done;
-undo:
-	for (u = counter; u != c; u = u->parent) {
-		spin_lock(&u->lock);
-		res_counter_uncharge_locked(u, val);
-		spin_unlock(&u->lock);
+
+	if (ret < 0 && !force) {
+		for (u = counter; u != c; u = u->parent) {
+			spin_lock(&u->lock);
+			res_counter_uncharge_locked(u, val);
+			spin_unlock(&u->lock);
+		}
 	}
-done:
 	local_irq_restore(flags);
+
 	return ret;
 }
 
+int res_counter_charge(struct res_counter *counter, unsigned long val,
+			struct res_counter **limit_fail_at)
+{
+	return __res_counter_charge(counter, val, limit_fail_at, false);
+}
+
 int res_counter_charge_nofail(struct res_counter *counter, unsigned long val,
 			      struct res_counter **limit_fail_at)
 {
-	int ret, r;
-	unsigned long flags;
-	struct res_counter *c;
-
-	r = ret = 0;
-	*limit_fail_at = NULL;
-	local_irq_save(flags);
-	for (c = counter; c != NULL; c = c->parent) {
-		spin_lock(&c->lock);
-		r = res_counter_charge_locked(c, val);
-		if (r)
-			c->usage += val;
-		spin_unlock(&c->lock);
-		if (r < 0 && ret == 0) {
-			*limit_fail_at = c;
-			ret = r;
-		}
-	}
-	local_irq_restore(flags);
-
-	return ret;
+	return __res_counter_charge(counter, val, limit_fail_at, true);
 }
+
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 {
 	if (WARN_ON(counter->usage < val))
-- 
cgit v1.2.3-59-g8ed1b


From 0d4dde1ac9a5af74ac76c6ab90557d1ae7b8f5d8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Apr 2012 01:11:36 +0200
Subject: res_counter: Account max_usage when calling
 res_counter_charge_nofail()

Updating max_usage is something one would expect when we reach
a new maximum usage value even when we do this by forcing through
the limit with res_counter_charge_nofail().

(Whether we want to account failcnt when we force through the limit
is another debate).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Glauber Costa <glommer@parallels.com>
Acked-by: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Li Zefan <lizefan@huawei.com>
---
 kernel/res_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 07a29923aba2..bebe2b170d49 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -35,7 +35,7 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val,
 	}
 
 	counter->usage += val;
-	if (!force && counter->usage > counter->max_usage)
+	if (counter->usage > counter->max_usage)
 		counter->max_usage = counter->usage;
 	return ret;
 }
-- 
cgit v1.2.3-59-g8ed1b