From 7e47682ea555e7c1edef1d8fd96e2aa4c12abe59 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 9 Jun 2015 21:32:09 +1000 Subject: cgroup: allow a cgroup subsystem to reject a fork Add a new cgroup subsystem callback can_fork that conditionally states whether or not the fork is accepted or rejected by a cgroup policy. In addition, add a cancel_fork callback so that if an error occurs later in the forking process, any state modified by can_fork can be reverted. Allow for a private opaque pointer to be passed from cgroup_can_fork to cgroup_post_fork, allowing for the fork state to be stored by each subsystem separately. Also add a tagging system for cgroup_subsys.h to allow for CGROUP_ enumerations to be be defined and used. In addition, explicitly add a CGROUP_CANFORK_COUNT macro to make arrays easier to define. This is in preparation for implementing the pids cgroup subsystem. Signed-off-by: Aleksa Sarai Signed-off-by: Tejun Heo --- include/linux/cgroup_subsys.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'include/linux/cgroup_subsys.h') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e4a96fb14403..ec43bce7e1ea 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -3,6 +3,17 @@ * * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ + +/* + * This file *must* be included with SUBSYS() defined. + * SUBSYS_TAG() is a noop if undefined. + */ + +#ifndef SUBSYS_TAG +#define __TMP_SUBSYS_TAG +#define SUBSYS_TAG(_x) +#endif + #if IS_ENABLED(CONFIG_CPUSETS) SUBSYS(cpuset) #endif @@ -47,12 +58,24 @@ SUBSYS(net_prio) SUBSYS(hugetlb) #endif +/* + * Subsystems that implement the can_fork() family of callbacks. + */ +SUBSYS_TAG(CANFORK_START) +SUBSYS_TAG(CANFORK_END) + /* * The following subsystems are not supported on the default hierarchy. */ #if IS_ENABLED(CONFIG_CGROUP_DEBUG) SUBSYS(debug) #endif + +#ifdef __TMP_SUBSYS_TAG +#undef __TMP_SUBSYS_TAG +#undef SUBSYS_TAG +#endif + /* * DO NOT ADD ANY SUBSYSTEM WITHOUT EXPLICIT ACKS FROM CGROUP MAINTAINERS. */ -- cgit v1.3-6-gb490 From 49b786ea146f69c371df18e81ce0a2d5839f865c Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 9 Jun 2015 21:32:10 +1000 Subject: cgroup: implement the PIDs subsystem Adds a new single-purpose PIDs subsystem to limit the number of tasks that can be forked inside a cgroup. Essentially this is an implementation of RLIMIT_NPROC that applies to a cgroup rather than a process tree. However, it should be noted that organisational operations (adding and removing tasks from a PIDs hierarchy) will *not* be prevented. Rather, the number of tasks in the hierarchy cannot exceed the limit through forking. This is due to the fact that, in the unified hierarchy, attach cannot fail (and it is not possible for a task to overcome its PIDs cgroup policy limit by attaching to a child cgroup -- even if migrating mid-fork it must be able to fork in the parent first). PIDs are fundamentally a global resource, and it is possible to reach PID exhaustion inside a cgroup without hitting any reasonable kmemcg policy. Once you've hit PID exhaustion, you're only in a marginally better state than OOM. This subsystem allows PID exhaustion inside a cgroup to be prevented. Signed-off-by: Aleksa Sarai Signed-off-by: Tejun Heo --- CREDITS | 5 + include/linux/cgroup_subsys.h | 5 + init/Kconfig | 16 ++ kernel/Makefile | 1 + kernel/cgroup_pids.c | 366 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 393 insertions(+) create mode 100644 kernel/cgroup_pids.c (limited to 'include/linux/cgroup_subsys.h') diff --git a/CREDITS b/CREDITS index 1d616640bbf6..4fcf9cd8544c 100644 --- a/CREDITS +++ b/CREDITS @@ -3219,6 +3219,11 @@ S: 69 rue Dunois S: 75013 Paris S: France +N: Aleksa Sarai +E: cyphar@cyphar.com +W: https://www.cyphar.com/ +D: `pids` cgroup subsystem + N: Dipankar Sarma E: dipankar@in.ibm.com D: RCU diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index ec43bce7e1ea..1f36945fd23d 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -62,6 +62,11 @@ SUBSYS(hugetlb) * Subsystems that implement the can_fork() family of callbacks. */ SUBSYS_TAG(CANFORK_START) + +#if IS_ENABLED(CONFIG_CGROUP_PIDS) +SUBSYS(pids) +#endif + SUBSYS_TAG(CANFORK_END) /* diff --git a/init/Kconfig b/init/Kconfig index af09b4fb43d2..2184b34cbf73 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -955,6 +955,22 @@ config CGROUP_FREEZER Provides a way to freeze and unfreeze all tasks in a cgroup. +config CGROUP_PIDS + bool "PIDs cgroup subsystem" + help + Provides enforcement of process number limits in the scope of a + cgroup. Any attempt to fork more processes than is allowed in the + cgroup will fail. PIDs are fundamentally a global resource because it + is fairly trivial to reach PID exhaustion before you reach even a + conservative kmemcg limit. As a result, it is possible to grind a + system to halt without being limited by other cgroup policies. The + PIDs cgroup subsystem is designed to stop this from happening. + + It should be noted that organisational operations (such as attaching + to a cgroup hierarchy will *not* be blocked by the PIDs subsystem), + since the PIDs limit only affects a process's ability to fork, not to + attach to a cgroup. + config CGROUP_DEVICE bool "Device controller for cgroups" help diff --git a/kernel/Makefile b/kernel/Makefile index 43c4c920f30a..718fb8afab7a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -55,6 +55,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_CGROUPS) += cgroup.o obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o +obj-$(CONFIG_CGROUP_PIDS) += cgroup_pids.o obj-$(CONFIG_CPUSETS) += cpuset.o obj-$(CONFIG_UTS_NS) += utsname.o obj-$(CONFIG_USER_NS) += user_namespace.o diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c new file mode 100644 index 000000000000..d75488824ae2 --- /dev/null +++ b/kernel/cgroup_pids.c @@ -0,0 +1,366 @@ +/* + * Process number limiting controller for cgroups. + * + * Used to allow a cgroup hierarchy to stop any new processes from fork()ing + * after a certain limit is reached. + * + * Since it is trivial to hit the task limit without hitting any kmemcg limits + * in place, PIDs are a fundamental resource. As such, PID exhaustion must be + * preventable in the scope of a cgroup hierarchy by allowing resource limiting + * of the number of tasks in a cgroup. + * + * In order to use the `pids` controller, set the maximum number of tasks in + * pids.max (this is not available in the root cgroup for obvious reasons). The + * number of processes currently in the cgroup is given by pids.current. + * Organisational operations are not blocked by cgroup policies, so it is + * possible to have pids.current > pids.max. However, it is not possible to + * violate a cgroup policy through fork(). fork() will return -EAGAIN if forking + * would cause a cgroup policy to be violated. + * + * To set a cgroup to have no limit, set pids.max to "max". This is the default + * for all new cgroups (N.B. that PID limits are hierarchical, so the most + * stringent limit in the hierarchy is followed). + * + * pids.current tracks all child cgroup hierarchies, so parent/pids.current is + * a superset of parent/child/pids.current. + * + * Copyright (C) 2015 Aleksa Sarai + * + * This file is subject to the terms and conditions of version 2 of the GNU + * General Public License. See the file COPYING in the main directory of the + * Linux distribution for more details. + */ + +#include +#include +#include +#include +#include + +#define PIDS_MAX (PID_MAX_LIMIT + 1ULL) +#define PIDS_MAX_STR "max" + +struct pids_cgroup { + struct cgroup_subsys_state css; + + /* + * Use 64-bit types so that we can safely represent "max" as + * %PIDS_MAX = (%PID_MAX_LIMIT + 1). + */ + atomic64_t counter; + int64_t limit; +}; + +static struct pids_cgroup *css_pids(struct cgroup_subsys_state *css) +{ + return container_of(css, struct pids_cgroup, css); +} + +static struct pids_cgroup *parent_pids(struct pids_cgroup *pids) +{ + return css_pids(pids->css.parent); +} + +static struct cgroup_subsys_state * +pids_css_alloc(struct cgroup_subsys_state *parent) +{ + struct pids_cgroup *pids; + + pids = kzalloc(sizeof(struct pids_cgroup), GFP_KERNEL); + if (!pids) + return ERR_PTR(-ENOMEM); + + pids->limit = PIDS_MAX; + atomic64_set(&pids->counter, 0); + return &pids->css; +} + +static void pids_css_free(struct cgroup_subsys_state *css) +{ + kfree(css_pids(css)); +} + +/** + * pids_cancel - uncharge the local pid count + * @pids: the pid cgroup state + * @num: the number of pids to cancel + * + * This function will WARN if the pid count goes under 0, because such a case is + * a bug in the pids controller proper. + */ +static void pids_cancel(struct pids_cgroup *pids, int num) +{ + /* + * A negative count (or overflow for that matter) is invalid, + * and indicates a bug in the `pids` controller proper. + */ + WARN_ON_ONCE(atomic64_add_negative(-num, &pids->counter)); +} + +/** + * pids_uncharge - hierarchically uncharge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to uncharge + */ +static void pids_uncharge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p; + + for (p = pids; p; p = parent_pids(p)) + pids_cancel(p, num); +} + +/** + * pids_charge - hierarchically charge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to charge + * + * This function does *not* follow the pid limit set. It cannot fail and the new + * pid count may exceed the limit. This is only used for reverting failed + * attaches, where there is no other way out than violating the limit. + */ +static void pids_charge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p; + + for (p = pids; p; p = parent_pids(p)) + atomic64_add(num, &p->counter); +} + +/** + * pids_try_charge - hierarchically try to charge the pid count + * @pids: the pid cgroup state + * @num: the number of pids to charge + * + * This function follows the set limit. It will fail if the charge would cause + * the new value to exceed the hierarchical limit. Returns 0 if the charge + * succeded, otherwise -EAGAIN. + */ +static int pids_try_charge(struct pids_cgroup *pids, int num) +{ + struct pids_cgroup *p, *q; + + for (p = pids; p; p = parent_pids(p)) { + int64_t new = atomic64_add_return(num, &p->counter); + + /* + * Since new is capped to the maximum number of pid_t, if + * p->limit is %PIDS_MAX then we know that this test will never + * fail. + */ + if (new > p->limit) + goto revert; + } + + return 0; + +revert: + for (q = pids; q != p; q = parent_pids(q)) + pids_cancel(q, num); + pids_cancel(p, num); + + return -EAGAIN; +} + +static int pids_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct pids_cgroup *pids = css_pids(css); + struct task_struct *task; + + cgroup_taskset_for_each(task, tset) { + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + /* + * Grab a ref to each task's css. We don't drop the ref until + * we either fail and hit ->cancel_attach() or succeed and hit + * ->attach(). + */ + old_css = task_get_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(pids, 1); + pids_uncharge(old_pids, 1); + } + + return 0; +} + +static void pids_cancel_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct pids_cgroup *pids = css_pids(css); + struct task_struct *task; + + cgroup_taskset_for_each(task, tset) { + struct cgroup_subsys_state *old_css; + struct pids_cgroup *old_pids; + + old_css = task_css(task, pids_cgrp_id); + old_pids = css_pids(old_css); + + pids_charge(old_pids, 1); + pids_uncharge(pids, 1); + css_put(old_css); + } +} + +static void pids_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) +{ + struct task_struct *task; + + cgroup_taskset_for_each(task, tset) + css_put(task_css(task, pids_cgrp_id)); +} + +static int pids_can_fork(struct task_struct *task, void **priv_p) +{ + struct cgroup_subsys_state *css; + struct pids_cgroup *pids; + int err; + + /* + * Use the "current" task_css for the pids subsystem as the tentative + * css. It is possible we will charge the wrong hierarchy, in which + * case we will forcefully revert/reapply the charge on the right + * hierarchy after it is committed to the task proper. + */ + css = task_get_css(current, pids_cgrp_id); + pids = css_pids(css); + + err = pids_try_charge(pids, 1); + if (err) + goto err_css_put; + + *priv_p = css; + return 0; + +err_css_put: + css_put(css); + return err; +} + +static void pids_cancel_fork(struct task_struct *task, void *priv) +{ + struct cgroup_subsys_state *css = priv; + struct pids_cgroup *pids = css_pids(css); + + pids_uncharge(pids, 1); + css_put(css); +} + +static void pids_fork(struct task_struct *task, void *priv) +{ + struct cgroup_subsys_state *css; + struct cgroup_subsys_state *old_css = priv; + struct pids_cgroup *pids; + struct pids_cgroup *old_pids = css_pids(old_css); + + css = task_get_css(task, pids_cgrp_id); + pids = css_pids(css); + + /* + * If the association has changed, we have to revert and reapply the + * charge/uncharge on the wrong hierarchy to the current one. Since + * the association can only change due to an organisation event, its + * okay for us to ignore the limit in this case. + */ + if (pids != old_pids) { + pids_uncharge(old_pids, 1); + pids_charge(pids, 1); + } + + css_put(css); + css_put(old_css); +} + +static void pids_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task) +{ + struct pids_cgroup *pids = css_pids(old_css); + + pids_uncharge(pids, 1); +} + +static ssize_t pids_max_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct cgroup_subsys_state *css = of_css(of); + struct pids_cgroup *pids = css_pids(css); + int64_t limit; + int err; + + buf = strstrip(buf); + if (!strcmp(buf, PIDS_MAX_STR)) { + limit = PIDS_MAX; + goto set_limit; + } + + err = kstrtoll(buf, 0, &limit); + if (err) + return err; + + if (limit < 0 || limit >= PIDS_MAX) + return -EINVAL; + +set_limit: + /* + * Limit updates don't need to be mutex'd, since it isn't + * critical that any racing fork()s follow the new limit. + */ + pids->limit = limit; + return nbytes; +} + +static int pids_max_show(struct seq_file *sf, void *v) +{ + struct cgroup_subsys_state *css = seq_css(sf); + struct pids_cgroup *pids = css_pids(css); + int64_t limit = pids->limit; + + if (limit >= PIDS_MAX) + seq_printf(sf, "%s\n", PIDS_MAX_STR); + else + seq_printf(sf, "%lld\n", limit); + + return 0; +} + +static s64 pids_current_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct pids_cgroup *pids = css_pids(css); + + return atomic64_read(&pids->counter); +} + +static struct cftype pids_files[] = { + { + .name = "max", + .write = pids_max_write, + .seq_show = pids_max_show, + .flags = CFTYPE_NOT_ON_ROOT, + }, + { + .name = "current", + .read_s64 = pids_current_read, + }, + { } /* terminate */ +}; + +struct cgroup_subsys pids_cgrp_subsys = { + .css_alloc = pids_css_alloc, + .css_free = pids_css_free, + .attach = pids_attach, + .can_attach = pids_can_attach, + .cancel_attach = pids_cancel_attach, + .can_fork = pids_can_fork, + .cancel_fork = pids_cancel_fork, + .fork = pids_fork, + .exit = pids_exit, + .legacy_cftypes = pids_files, + .dfl_cftypes = pids_files, +}; -- cgit v1.3-6-gb490 From c165b3e3c7bb68c2ed55a5ac2623f030d01d9567 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 18 Aug 2015 14:55:29 -0700 Subject: blkcg: rename subsystem name from blkio to io blkio interface has become messy over time and is currently the largest. In addition to the inconsistent naming scheme, it has multiple stat files which report more or less the same thing, a number of debug stat files which expose internal details which shouldn't have been part of the public interface in the first place, recursive and non-recursive stats and leaf and non-leaf knobs. Both recursive vs. non-recursive and leaf vs. non-leaf distinctions don't make any sense on the unified hierarchy as only leaf cgroups can contain processes. cgroups is going through a major interface revision with the unified hierarchy involving significant fundamental usage changes and given that a significant portion of the interface doesn't make sense anymore, it's a good time to reorganize the interface. As the first step, this patch renames the external visible subsystem name from "blkio" to "io". This is more concise, matches the other two major subsystem names, "cpu" and "memory", and better suited as blkcg will be involved in anything writeback related too whether an actual block device is involved or not. As the subsystem legacy_name is set to "blkio", the only userland visible change outside the unified hierarchy is that blkcg is reported as "io" instead of "blkio" in the subsystem initialized message during boot. On the unified hierarchy, blkcg now appears as "io". Signed-off-by: Tejun Heo Cc: Li Zefan Cc: Johannes Weiner Cc: cgroups@vger.kernel.org Signed-off-by: Jens Axboe --- block/bio.c | 2 +- block/blk-cgroup.c | 7 ++++--- include/linux/backing-dev.h | 2 +- include/linux/blk-cgroup.h | 4 ++-- include/linux/cgroup_subsys.h | 2 +- mm/backing-dev.c | 4 ++-- 6 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux/cgroup_subsys.h') diff --git a/block/bio.c b/block/bio.c index d6e5ba3399f0..c52222c6c69b 100644 --- a/block/bio.c +++ b/block/bio.c @@ -2046,7 +2046,7 @@ int bio_associate_current(struct bio *bio) get_io_context_active(ioc); bio->bi_ioc = ioc; - bio->bi_css = task_get_css(current, blkio_cgrp_id); + bio->bi_css = task_get_css(current, io_cgrp_id); return 0; } EXPORT_SYMBOL_GPL(bio_associate_current); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0af3bff198ed..fc197ea4c992 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1089,12 +1089,13 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, return ret; } -struct cgroup_subsys blkio_cgrp_subsys = { +struct cgroup_subsys io_cgrp_subsys = { .css_alloc = blkcg_css_alloc, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, .can_attach = blkcg_can_attach, .legacy_cftypes = blkcg_files, + .legacy_name = "blkio", #ifdef CONFIG_MEMCG /* * This ensures that, if available, memcg is automatically enabled @@ -1104,7 +1105,7 @@ struct cgroup_subsys blkio_cgrp_subsys = { .depends_on = 1 << memory_cgrp_id, #endif }; -EXPORT_SYMBOL_GPL(blkio_cgrp_subsys); +EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** * blkcg_activate_policy - activate a blkcg policy on a request_queue @@ -1266,7 +1267,7 @@ int blkcg_policy_register(struct blkcg_policy *pol) /* everything is in place, add intf files for the new policy */ if (pol->cftypes) - WARN_ON(cgroup_add_legacy_cftypes(&blkio_cgrp_subsys, + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, pol->cftypes)); mutex_unlock(&blkcg_pol_register_mutex); return 0; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 23ebb946e66f..5a5d79ee256f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi * %current's blkcg equals the effective blkcg of its memcg. No * need to use the relatively expensive cgroup_get_e_css(). */ - if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id))) + if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id))) return wb; return NULL; } diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 286e1bde249f..db89acd2a864 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -221,7 +221,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return css_to_blkcg(task_css(tsk, blkio_cgrp_id)); + return css_to_blkcg(task_css(tsk, io_cgrp_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) @@ -234,7 +234,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) static inline struct cgroup_subsys_state * task_get_blkcg_css(struct task_struct *task) { - return task_get_css(task, blkio_cgrp_id); + return task_get_css(task, io_cgrp_id); } /** diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index e4a96fb14403..86b5056104df 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -16,7 +16,7 @@ SUBSYS(cpuacct) #endif #if IS_ENABLED(CONFIG_BLK_CGROUP) -SUBSYS(blkio) +SUBSYS(io) #endif #if IS_ENABLED(CONFIG_MEMCG) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dac5bf59309d..d0ee90e72867 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -523,7 +523,7 @@ static int cgwb_create(struct backing_dev_info *bdi, int ret = 0; memcg = mem_cgroup_from_css(memcg_css); - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &blkio_cgrp_subsys); + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); blkcg_cgwb_list = &blkcg->cgwb_list; @@ -645,7 +645,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, - &blkio_cgrp_subsys); + &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; -- cgit v1.3-6-gb490