From 18fa84a2db0e15b02baa5d94bdb5bd509175d2f6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 29 May 2019 13:46:25 -0700 Subject: cgroup: Use css_tryget() instead of css_tryget_online() in task_get_css() A PF_EXITING task can stay associated with an offline css. If such task calls task_get_css(), it can get stuck indefinitely. This can be triggered by BSD process accounting which writes to a file with PF_EXITING set when racing against memcg disable as in the backtrace at the end. After this change, task_get_css() may return a css which was already offline when the function was called. None of the existing users are affected by this change. INFO: rcu_sched self-detected stall on CPU INFO: rcu_sched detected stalls on CPUs/tasks: ... NMI backtrace for cpu 0 ... Call Trace: dump_stack+0x46/0x68 nmi_cpu_backtrace.cold.2+0x13/0x57 nmi_trigger_cpumask_backtrace+0xba/0xca rcu_dump_cpu_stacks+0x9e/0xce rcu_check_callbacks.cold.74+0x2af/0x433 update_process_times+0x28/0x60 tick_sched_timer+0x34/0x70 __hrtimer_run_queues+0xee/0x250 hrtimer_interrupt+0xf4/0x210 smp_apic_timer_interrupt+0x56/0x110 apic_timer_interrupt+0xf/0x20 RIP: 0010:balance_dirty_pages_ratelimited+0x28f/0x3d0 ... btrfs_file_write_iter+0x31b/0x563 __vfs_write+0xfa/0x140 __kernel_write+0x4f/0x100 do_acct_process+0x495/0x580 acct_process+0xb9/0xdb do_exit+0x748/0xa00 do_group_exit+0x3a/0xa0 get_signal+0x254/0x560 do_signal+0x23/0x5c0 exit_to_usermode_loop+0x5d/0xa0 prepare_exit_to_usermode+0x53/0x80 retint_user+0x8/0x8 Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org # v4.2+ Fixes: ec438699a9ae ("cgroup, block: implement task_get_css() and use it in bio_associate_current()") --- include/linux/cgroup.h | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c0077adeea83..a7e4611e20c8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -487,7 +487,7 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a - * valid css. + * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) @@ -497,7 +497,13 @@ task_get_css(struct task_struct *task, int subsys_id) rcu_read_lock(); while (true) { css = task_css(task, subsys_id); - if (likely(css_tryget_online(css))) + /* + * Can't use css_tryget_online() here. A task which has + * PF_EXITING set may stay associated with an offline css. + * If such task calls this function, css_tryget_online() + * will keep failing. + */ + if (likely(css_tryget(css))) break; cpu_relax(); } -- cgit v1.2.3-59-g8ed1b From b636fd38dc40113f853337a7d2a6885ad23b8811 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Implement css_task_iter_skip() When a task is moved out of a cset, task iterators pointing to the task are advanced using the normal css_task_iter_advance() call. This is fine but we'll be tracking dying tasks on csets and thus moving tasks from cset->tasks to (to be added) cset->dying_tasks. When we remove a task from cset->tasks, if we advance the iterators, they may move over to the next cset before we had the chance to add the task back on the dying list, which can allow the task to escape iteration. This patch separates out skipping from advancing. Skipping only moves the affected iterators to the next pointer rather than fully advancing it and the following advancing will recognize that the cursor has already been moved forward and do the rest of advancing. This ensures that when a task moves from one list to another in its cset, as long as it moves in the right direction, it's always visible to iteration. This doesn't cause any visible behavior changes. Signed-off-by: Tejun Heo Cc: Oleg Nesterov --- include/linux/cgroup.h | 3 +++ kernel/cgroup/cgroup.c | 60 ++++++++++++++++++++++++++++++-------------------- 2 files changed, 39 insertions(+), 24 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index a7e4611e20c8..05ed2a209e74 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -43,6 +43,9 @@ /* walk all threaded css_sets in the domain */ #define CSS_TASK_ITER_THREADED (1U << 1) +/* internal flags */ +#define CSS_TASK_ITER_SKIPPED (1U << 16) + /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 217cec4e22c6..035aee466bbf 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -215,7 +215,8 @@ static struct cftype cgroup_base_files[]; static int cgroup_apply_control(struct cgroup *cgrp); static void cgroup_finalize_control(struct cgroup *cgrp, int ret); -static void css_task_iter_advance(struct css_task_iter *it); +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task); static int cgroup_destroy_locked(struct cgroup *cgrp); static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, struct cgroup_subsys *ss); @@ -843,6 +844,21 @@ static void css_set_update_populated(struct css_set *cset, bool populated) cgroup_update_populated(link->cgrp, populated); } +/* + * @task is leaving, advance task iterators which are pointing to it so + * that they can resume at the next position. Advancing an iterator might + * remove it from the list, use safe walk. See css_task_iter_skip() for + * details. + */ +static void css_set_skip_task_iters(struct css_set *cset, + struct task_struct *task) +{ + struct css_task_iter *it, *pos; + + list_for_each_entry_safe(it, pos, &cset->task_iters, iters_node) + css_task_iter_skip(it, task); +} + /** * css_set_move_task - move a task from one css_set to another * @task: task being moved @@ -868,22 +884,9 @@ static void css_set_move_task(struct task_struct *task, css_set_update_populated(to_cset, true); if (from_cset) { - struct css_task_iter *it, *pos; - WARN_ON_ONCE(list_empty(&task->cg_list)); - /* - * @task is leaving, advance task iterators which are - * pointing to it so that they can resume at the next - * position. Advancing an iterator might remove it from - * the list, use safe walk. See css_task_iter_advance*() - * for details. - */ - list_for_each_entry_safe(it, pos, &from_cset->task_iters, - iters_node) - if (it->task_pos == &task->cg_list) - css_task_iter_advance(it); - + css_set_skip_task_iters(from_cset, task); list_del_init(&task->cg_list); if (!css_set_populated(from_cset)) css_set_update_populated(from_cset, false); @@ -4430,10 +4433,19 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) list_add(&it->iters_node, &cset->task_iters); } -static void css_task_iter_advance(struct css_task_iter *it) +static void css_task_iter_skip(struct css_task_iter *it, + struct task_struct *task) { - struct list_head *next; + lockdep_assert_held(&css_set_lock); + + if (it->task_pos == &task->cg_list) { + it->task_pos = it->task_pos->next; + it->flags |= CSS_TASK_ITER_SKIPPED; + } +} +static void css_task_iter_advance(struct css_task_iter *it) +{ lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { @@ -4442,15 +4454,15 @@ repeat: * consumed first and then ->mg_tasks. After ->mg_tasks, * we move onto the next cset. */ - next = it->task_pos->next; - - if (next == it->tasks_head) - next = it->mg_tasks_head->next; + if (it->flags & CSS_TASK_ITER_SKIPPED) + it->flags &= ~CSS_TASK_ITER_SKIPPED; + else + it->task_pos = it->task_pos->next; - if (next == it->mg_tasks_head) + if (it->task_pos == it->tasks_head) + it->task_pos = it->mg_tasks_head->next; + if (it->task_pos == it->mg_tasks_head) css_task_iter_advance_css_set(it); - else - it->task_pos = next; } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); -- cgit v1.2.3-59-g8ed1b From c03cd7738a83b13739f00546166969342c8ff014 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 31 May 2019 10:38:58 -0700 Subject: cgroup: Include dying leaders with live threads in PROCS iterations CSS_TASK_ITER_PROCS currently iterates live group leaders; however, this means that a process with dying leader and live threads will be skipped. IOW, cgroup.procs might be empty while cgroup.threads isn't, which is confusing to say the least. Fix it by making cset track dying tasks and include dying leaders with live threads in PROCS iteration. Signed-off-by: Tejun Heo Reported-and-tested-by: Topi Miettinen Cc: Oleg Nesterov --- include/linux/cgroup-defs.h | 1 + include/linux/cgroup.h | 1 + kernel/cgroup/cgroup.c | 44 +++++++++++++++++++++++++++++++++++++------- 3 files changed, 39 insertions(+), 7 deletions(-) (limited to 'include/linux/cgroup.h') diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 77258d276f93..1615b9c17e02 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -216,6 +216,7 @@ struct css_set { */ struct list_head tasks; struct list_head mg_tasks; + struct list_head dying_tasks; /* all css_task_iters currently walking this cset */ struct list_head task_iters; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 05ed2a209e74..0297f930a56e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -60,6 +60,7 @@ struct css_task_iter { struct list_head *task_pos; struct list_head *tasks_head; struct list_head *mg_tasks_head; + struct list_head *dying_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 035aee466bbf..a7df319c2e9a 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -739,6 +739,7 @@ struct css_set init_css_set = { .dom_cset = &init_css_set, .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), + .dying_tasks = LIST_HEAD_INIT(init_css_set.dying_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), .threaded_csets = LIST_HEAD_INIT(init_css_set.threaded_csets), .cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links), @@ -1213,6 +1214,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, cset->dom_cset = cset; INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); + INIT_LIST_HEAD(&cset->dying_tasks); INIT_LIST_HEAD(&cset->task_iters); INIT_LIST_HEAD(&cset->threaded_csets); INIT_HLIST_NODE(&cset->hlist); @@ -4399,15 +4401,18 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it) it->task_pos = NULL; return; } - } while (!css_set_populated(cset)); + } while (!css_set_populated(cset) && !list_empty(&cset->dying_tasks)); if (!list_empty(&cset->tasks)) it->task_pos = cset->tasks.next; - else + else if (!list_empty(&cset->mg_tasks)) it->task_pos = cset->mg_tasks.next; + else + it->task_pos = cset->dying_tasks.next; it->tasks_head = &cset->tasks; it->mg_tasks_head = &cset->mg_tasks; + it->dying_tasks_head = &cset->dying_tasks; /* * We don't keep css_sets locked across iteration steps and thus @@ -4446,6 +4451,8 @@ static void css_task_iter_skip(struct css_task_iter *it, static void css_task_iter_advance(struct css_task_iter *it) { + struct task_struct *task; + lockdep_assert_held(&css_set_lock); repeat: if (it->task_pos) { @@ -4462,17 +4469,32 @@ repeat: if (it->task_pos == it->tasks_head) it->task_pos = it->mg_tasks_head->next; if (it->task_pos == it->mg_tasks_head) + it->task_pos = it->dying_tasks_head->next; + if (it->task_pos == it->dying_tasks_head) css_task_iter_advance_css_set(it); } else { /* called from start, proceed to the first cset */ css_task_iter_advance_css_set(it); } - /* if PROCS, skip over tasks which aren't group leaders */ - if ((it->flags & CSS_TASK_ITER_PROCS) && it->task_pos && - !thread_group_leader(list_entry(it->task_pos, struct task_struct, - cg_list))) - goto repeat; + if (!it->task_pos) + return; + + task = list_entry(it->task_pos, struct task_struct, cg_list); + + if (it->flags & CSS_TASK_ITER_PROCS) { + /* if PROCS, skip over tasks which aren't group leaders */ + if (!thread_group_leader(task)) + goto repeat; + + /* and dying leaders w/o live member threads */ + if (!atomic_read(&task->signal->live)) + goto repeat; + } else { + /* skip all dying ones */ + if (task->flags & PF_EXITING) + goto repeat; + } } /** @@ -6009,6 +6031,7 @@ void cgroup_exit(struct task_struct *tsk) if (!list_empty(&tsk->cg_list)) { spin_lock_irq(&css_set_lock); css_set_move_task(tsk, cset, NULL, false); + list_add_tail(&tsk->cg_list, &cset->dying_tasks); cset->nr_tasks--; WARN_ON_ONCE(cgroup_task_frozen(tsk)); @@ -6034,6 +6057,13 @@ void cgroup_release(struct task_struct *task) do_each_subsys_mask(ss, ssid, have_release_callback) { ss->release(task); } while_each_subsys_mask(); + + if (use_task_css_set_links) { + spin_lock_irq(&css_set_lock); + css_set_skip_task_iters(task_css_set(task), task); + list_del_init(&task->cg_list); + spin_unlock_irq(&css_set_lock); + } } void cgroup_free(struct task_struct *task) -- cgit v1.2.3-59-g8ed1b