From 67516844625f45f0ce148a01c27bf41f591872b2 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 9 Jul 2013 18:56:31 +0200 Subject: perf: Remove the 'match' callback for auxiliary events processing It gives the following benefits: - only one function pointer is passed along the way - the 'match' function is called within output function and could be inlined by the compiler Suggested-by: Peter Zijlstra Signed-off-by: Jiri Olsa Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1373388991-9711-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 79 ++++++++++++++++++++++++++-------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index eba8fb5834ae..708ab70ca442 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4680,12 +4680,10 @@ perf_event_read_event(struct perf_event *event, perf_output_end(&handle); } -typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data); typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data); static void perf_event_aux_ctx(struct perf_event_context *ctx, - perf_event_aux_match_cb match, perf_event_aux_output_cb output, void *data) { @@ -4696,15 +4694,12 @@ perf_event_aux_ctx(struct perf_event_context *ctx, continue; if (!event_filter_match(event)) continue; - if (match(event, data)) - output(event, data); + output(event, data); } } static void -perf_event_aux(perf_event_aux_match_cb match, - perf_event_aux_output_cb output, - void *data, +perf_event_aux(perf_event_aux_output_cb output, void *data, struct perf_event_context *task_ctx) { struct perf_cpu_context *cpuctx; @@ -4717,7 +4712,7 @@ perf_event_aux(perf_event_aux_match_cb match, cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; - perf_event_aux_ctx(&cpuctx->ctx, match, output, data); + perf_event_aux_ctx(&cpuctx->ctx, output, data); if (task_ctx) goto next; ctxn = pmu->task_ctx_nr; @@ -4725,14 +4720,14 @@ perf_event_aux(perf_event_aux_match_cb match, goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) - perf_event_aux_ctx(ctx, match, output, data); + perf_event_aux_ctx(ctx, output, data); next: put_cpu_ptr(pmu->pmu_cpu_context); } if (task_ctx) { preempt_disable(); - perf_event_aux_ctx(task_ctx, match, output, data); + perf_event_aux_ctx(task_ctx, output, data); preempt_enable(); } rcu_read_unlock(); @@ -4759,6 +4754,12 @@ struct perf_task_event { } event_id; }; +static int perf_event_task_match(struct perf_event *event) +{ + return event->attr.comm || event->attr.mmap || + event->attr.mmap_data || event->attr.task; +} + static void perf_event_task_output(struct perf_event *event, void *data) { @@ -4768,6 +4769,9 @@ static void perf_event_task_output(struct perf_event *event, struct task_struct *task = task_event->task; int ret, size = task_event->event_id.header.size; + if (!perf_event_task_match(event)) + return; + perf_event_header__init_id(&task_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, @@ -4790,13 +4794,6 @@ out: task_event->event_id.header.size = size; } -static int perf_event_task_match(struct perf_event *event, - void *data __maybe_unused) -{ - return event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task; -} - static void perf_event_task(struct task_struct *task, struct perf_event_context *task_ctx, int new) @@ -4825,8 +4822,7 @@ static void perf_event_task(struct task_struct *task, }, }; - perf_event_aux(perf_event_task_match, - perf_event_task_output, + perf_event_aux(perf_event_task_output, &task_event, task_ctx); } @@ -4853,6 +4849,11 @@ struct perf_comm_event { } event_id; }; +static int perf_event_comm_match(struct perf_event *event) +{ + return event->attr.comm; +} + static void perf_event_comm_output(struct perf_event *event, void *data) { @@ -4862,6 +4863,9 @@ static void perf_event_comm_output(struct perf_event *event, int size = comm_event->event_id.header.size; int ret; + if (!perf_event_comm_match(event)) + return; + perf_event_header__init_id(&comm_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, comm_event->event_id.header.size); @@ -4883,12 +4887,6 @@ out: comm_event->event_id.header.size = size; } -static int perf_event_comm_match(struct perf_event *event, - void *data __maybe_unused) -{ - return event->attr.comm; -} - static void perf_event_comm_event(struct perf_comm_event *comm_event) { char comm[TASK_COMM_LEN]; @@ -4903,8 +4901,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - perf_event_aux(perf_event_comm_match, - perf_event_comm_output, + perf_event_aux(perf_event_comm_output, comm_event, NULL); } @@ -4967,6 +4964,17 @@ struct perf_mmap_event { } event_id; }; +static int perf_event_mmap_match(struct perf_event *event, + void *data) +{ + struct perf_mmap_event *mmap_event = data; + struct vm_area_struct *vma = mmap_event->vma; + int executable = vma->vm_flags & VM_EXEC; + + return (!executable && event->attr.mmap_data) || + (executable && event->attr.mmap); +} + static void perf_event_mmap_output(struct perf_event *event, void *data) { @@ -4976,6 +4984,9 @@ static void perf_event_mmap_output(struct perf_event *event, int size = mmap_event->event_id.header.size; int ret; + if (!perf_event_mmap_match(event, data)) + return; + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, mmap_event->event_id.header.size); @@ -4996,17 +5007,6 @@ out: mmap_event->event_id.header.size = size; } -static int perf_event_mmap_match(struct perf_event *event, - void *data) -{ - struct perf_mmap_event *mmap_event = data; - struct vm_area_struct *vma = mmap_event->vma; - int executable = vma->vm_flags & VM_EXEC; - - return (!executable && event->attr.mmap_data) || - (executable && event->attr.mmap); -} - static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; @@ -5070,8 +5070,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - perf_event_aux(perf_event_mmap_match, - perf_event_mmap_output, + perf_event_aux(perf_event_mmap_output, mmap_event, NULL); -- cgit v1.3-8-gc7d7 From c4be9cb4f19cbd534a6c4c334cd48d8bb483e17a Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 8 Jul 2013 14:23:51 -0700 Subject: lglock: Update lockdep annotations to report recursive local locks Oleg Nesterov recently noticed that the lockdep annotations in lglock.c are not sufficient to detect some obvious deadlocks, such as lg_local_lock(LOCK) + lg_local_lock(LOCK) or spin_lock(X) + lg_local_lock(Y) vs lg_local_lock(Y) + spin_lock(X). Both issues are easily fixed by indicating to lockdep that lglock's local locks are not recursive. We shouldn't use the rwlock acquire/release functions here, as lglock doesn't share the same semantics. Instead we can base our lockdep annotations on the lock_acquire_shared (for local lglock) and lock_acquire_exclusive (for global lglock) helpers. I am not proposing new lglock specific helpers as I don't see the point of the existing second level of helpers :) Noticed-by: Oleg Nesterov Signed-off-by: Michel Lespinasse Cc: Lai Jiangshan Cc: "Srivatsa S. Bhat" Cc: Rusty Russell Cc: Andi Kleen Cc: "Paul E. McKenney" Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130708212352.1769031C15E@corp2gmr1-1.hot.corp.google.com Signed-off-by: Ingo Molnar --- kernel/lglock.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/lglock.c b/kernel/lglock.c index 6535a667a5a7..86ae2aebf004 100644 --- a/kernel/lglock.c +++ b/kernel/lglock.c @@ -21,7 +21,7 @@ void lg_local_lock(struct lglock *lg) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_lock(lock); } @@ -31,7 +31,7 @@ void lg_local_unlock(struct lglock *lg) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = this_cpu_ptr(lg->lock); arch_spin_unlock(lock); preempt_enable(); @@ -43,7 +43,7 @@ void lg_local_lock_cpu(struct lglock *lg, int cpu) arch_spinlock_t *lock; preempt_disable(); - rwlock_acquire_read(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_lock(lock); } @@ -53,7 +53,7 @@ void lg_local_unlock_cpu(struct lglock *lg, int cpu) { arch_spinlock_t *lock; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); lock = per_cpu_ptr(lg->lock, cpu); arch_spin_unlock(lock); preempt_enable(); @@ -65,7 +65,7 @@ void lg_global_lock(struct lglock *lg) int i; preempt_disable(); - rwlock_acquire(&lg->lock_dep_map, 0, 0, _RET_IP_); + lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); @@ -78,7 +78,7 @@ void lg_global_unlock(struct lglock *lg) { int i; - rwlock_release(&lg->lock_dep_map, 1, _RET_IP_); + lock_release(&lg->lock_dep_map, 1, _RET_IP_); for_each_possible_cpu(i) { arch_spinlock_t *lock; lock = per_cpu_ptr(lg->lock, i); -- cgit v1.3-8-gc7d7 From cedce3e730833d26a37826a96e1905b6ef387df9 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 4 Jul 2013 22:48:20 +0400 Subject: sched/__wake_up_sync_key(): Fix nr_exclusive tasks which lead to WF_SYNC clearing Only one task can replace the waker. Signed-off-by: Kirill Tkhai CC: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/512421372963700@web25f.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d8eb4525e76..f73787159188 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2660,7 +2660,7 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, if (unlikely(!q)) return; - if (unlikely(!nr_exclusive)) + if (unlikely(nr_exclusive != 1)) wake_flags = 0; spin_lock_irqsave(&q->lock, flags); -- cgit v1.3-8-gc7d7 From 8f89140ae41ccd9c63344e6823faa862aa7435e3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:10 -0700 Subject: cgroup: minor updates around cgroup_clear_directory() * Rename it to cgroup_clear_dir() and make it take the pointer to the target cgroup instead of the the dentry. This makes the function consistent with its counterpart - cgroup_populate_dir(). * Move cgroup_clear_directory() invocation from cgroup_d_remove_dir() to cgroup_remount() so that the function doesn't have to determine the cgroup pointer back from the dentry. cgroup_d_remove_dir() now only deals with vfs, which is slightly cleaner. This patch doesn't introduce any functional differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5583d10a325..09bfa870e698 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -957,15 +957,14 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_directory - selective removal of base and subsystem files - * @dir: directory containing the files + * cgroup_clear_dir - selective removal of base and subsystem files + * @cgrp: target cgroup * @base_files: true if the base files should be removed * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_directory(struct dentry *dir, bool base_files, - unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, + unsigned long subsys_mask) { - struct cgroup *cgrp = __d_cgrp(dir); struct cgroup_subsys *ss; for_each_root_subsys(cgrp->root, ss) { @@ -987,9 +986,6 @@ static void cgroup_clear_directory(struct dentry *dir, bool base_files, static void cgroup_d_remove_dir(struct dentry *dentry) { struct dentry *parent; - struct cgroupfs_root *root = dentry->d_sb->s_fs_info; - - cgroup_clear_directory(dentry, true, root->subsys_mask); parent = dentry->d_parent; spin_lock(&parent->d_lock); @@ -1376,7 +1372,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) * this before rebind_subsystems, since rebind_subsystems may * change this hierarchy's subsys_list. */ - cgroup_clear_directory(cgrp->dentry, false, removed_mask); + cgroup_clear_dir(cgrp, false, removed_mask); ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) { @@ -4541,9 +4537,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) raw_spin_unlock(&release_list_lock); /* - * Remove @cgrp directory. The removal puts the base ref but we - * aren't quite done with @cgrp yet, so hold onto it. + * Clear and remove @cgrp directory. The removal puts the base ref + * but we aren't quite done with @cgrp yet, so hold onto it. */ + cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask); dget(d); cgroup_d_remove_dir(d); -- cgit v1.3-8-gc7d7 From b1f28d3109349899e87377e89f9d8ab5bc95ec57 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:10 -0700 Subject: cgroup: fix error path of cgroup_addrm_files() cgroup_addrm_files() mishandled error return value from cgroup_add_file() and returns error iff the last file fails to create. As we're in the process of cleaning up file add/rm error handling and will reliably propagate file creation failures, there's no point in keeping adding files after a failure. Replace the broken error collection logic with immediate error return. While at it, add lockdep assertions and function comment. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 09bfa870e698..9b16d75bec63 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2780,11 +2780,26 @@ out: return error; } +/** + * cgroup_addrm_files - add or remove files to a cgroup directory + * @cgrp: the target cgroup + * @subsys: the subsystem of files to be added + * @cfts: array of cftypes to be added + * @is_add: whether to add or remove + * + * Depending on @is_add, add or remove files defined by @cfts on @cgrp. + * All @cfts should belong to @subsys. For removals, this function never + * fails. If addition fails, this function doesn't remove files already + * added. The caller is responsible for cleaning up. + */ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, struct cftype cfts[], bool is_add) { struct cftype *cft; - int err, ret = 0; + int ret; + + lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); + lockdep_assert_held(&cgroup_mutex); for (cft = cfts; cft->name[0] != '\0'; cft++) { /* does cft->flags tell us to skip this file on @cgrp? */ @@ -2796,16 +2811,17 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - err = cgroup_add_file(cgrp, subsys, cft); - if (err) + ret = cgroup_add_file(cgrp, subsys, cft); + if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", - cft->name, err); - ret = err; + cft->name, ret); + return ret; + } } else { cgroup_rm_file(cgrp, cft); } } - return ret; + return 0; } static void cgroup_cfts_prepare(void) -- cgit v1.3-8-gc7d7 From 9ccece80ae19ed42439fc0ced76858f189cd41e8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: fix cgroup_add_cftypes() error handling cgroup_add_cftypes() uses cgroup_cfts_commit() to actually create the files; however, both functions ignore actual file creation errors and just assume success. This can lead to, for example, blkio hierarchy with some of the cgroups with only subset of interface files populated after cfq-iosched is loaded under heavy memory pressure, which is nasty. This patch updates cgroup_cfts_commit() and cgroup_add_cftypes() to guarantee that all files are created on success and no file is created on failure. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9b16d75bec63..36c0ccc921f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2836,8 +2836,8 @@ static void cgroup_cfts_prepare(void) mutex_lock(&cgroup_mutex); } -static void cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) +static int cgroup_cfts_commit(struct cgroup_subsys *ss, + struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) { LIST_HEAD(pending); @@ -2846,12 +2846,13 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, struct dentry *prev = NULL; struct inode *inode; u64 update_before; + int ret = 0; /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ if (!cfts || ss->root == &cgroup_dummy_root || !atomic_inc_not_zero(&sb->s_active)) { mutex_unlock(&cgroup_mutex); - return; + return 0; } /* @@ -2867,10 +2868,13 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, inode = root->dentry->d_inode; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - cgroup_addrm_files(root, ss, cfts, is_add); + ret = cgroup_addrm_files(root, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); + if (ret) + goto out_deact; + /* add/rm files for all cgroups created before */ rcu_read_lock(); cgroup_for_each_descendant_pre(cgrp, root) { @@ -2887,15 +2891,19 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - cgroup_addrm_files(cgrp, ss, cfts, is_add); + ret = cgroup_addrm_files(cgrp, ss, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); rcu_read_lock(); + if (ret) + break; } rcu_read_unlock(); dput(prev); +out_deact: deactivate_super(sb); + return ret; } /** @@ -2915,6 +2923,7 @@ static void cgroup_cfts_commit(struct cgroup_subsys *ss, int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) @@ -2923,9 +2932,10 @@ int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) cgroup_cfts_prepare(); set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - cgroup_cfts_commit(ss, cfts, true); - - return 0; + ret = cgroup_cfts_commit(ss, cfts, true); + if (ret) + cgroup_rm_cftypes(ss, cfts); + return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); -- cgit v1.3-8-gc7d7 From 628f7cd47ab758cae0353d1a6decf3d1459dca24 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: separate out cgroup_base_files[] handling out of cgroup_populate/clear_dir() cgroup_populate/clear_dir() currently take @base_files and adds and removes, respectively, cgroup_base_files[] to the directory. File additions and removals are being reorganized for proper error handling and more dynamic handling for the unified hierarchy, and mixing base and subsys file handling into the same functions gets a bit confusing. This patch moves base file handling out of cgroup_populate/clear_dir() into their users - cgroup_mount(), cgroup_create() and cgroup_destroy_locked(). Note that this changes the behavior of base file removal. If @base_files is %true, cgroup_clear_dir() used to delete files regardless of cftype until there's no files left. Now, only files with matching cfts are removed. As files can only be created by the base or registered cftypes, this shouldn't result in any behavior difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 46 +++++++++++++++++++--------------------------- 1 file changed, 19 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 36c0ccc921f4..9835a097f3c0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -215,6 +215,8 @@ static u64 cgroup_serial_nr_next = 1; */ static int need_forkexit_callback __read_mostly; +static struct cftype cgroup_base_files[]; + static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, @@ -804,8 +806,7 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask); +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask); static const struct inode_operations cgroup_dir_inode_operations; static const struct file_operations proc_cgroupstats_operations; @@ -957,13 +958,11 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) } /** - * cgroup_clear_dir - selective removal of base and subsystem files + * cgroup_clear_dir - remove subsys files in a cgroup directory * @cgrp: target cgroup - * @base_files: true if the base files should be removed * @subsys_mask: mask of the subsystem ids whose files should be removed */ -static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask) +static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; @@ -974,10 +973,6 @@ static void cgroup_clear_dir(struct cgroup *cgrp, bool base_files, list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, NULL, set->cfts, false); } - if (base_files) { - while (!list_empty(&cgrp->files)) - cgroup_rm_file(cgrp, NULL); - } } /* @@ -1372,17 +1367,17 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) * this before rebind_subsystems, since rebind_subsystems may * change this hierarchy's subsys_list. */ - cgroup_clear_dir(cgrp, false, removed_mask); + cgroup_clear_dir(cgrp, removed_mask); ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) { /* rebind_subsystems failed, re-populate the removed files */ - cgroup_populate_dir(cgrp, false, removed_mask); + cgroup_populate_dir(cgrp, removed_mask); goto out_unlock; } /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, false, added_mask); + cgroup_populate_dir(cgrp, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1687,7 +1682,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(root->number_of_cgroups != 1); cred = override_creds(&init_cred); - cgroup_populate_dir(root_cgrp, true, root->subsys_mask); + cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + cgroup_populate_dir(root_cgrp, root->subsys_mask); revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); @@ -4172,23 +4168,14 @@ static struct cftype cgroup_base_files[] = { }; /** - * cgroup_populate_dir - selectively creation of files in a directory + * cgroup_populate_dir - create subsys files in a cgroup directory * @cgrp: target cgroup - * @base_files: true if the base files should be added * @subsys_mask: mask of the subsystem ids whose files should be added */ -static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, - unsigned long subsys_mask) +static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { - int err; struct cgroup_subsys *ss; - if (base_files) { - err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); - if (err < 0) - return err; - } - /* process cftsets of each subsystem */ for_each_root_subsys(cgrp->root, ss) { struct cftype_set *set; @@ -4410,7 +4397,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } - err = cgroup_populate_dir(cgrp, true, root->subsys_mask); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); + if (err) + goto err_destroy; + + err = cgroup_populate_dir(cgrp, root->subsys_mask); if (err) goto err_destroy; @@ -4566,7 +4557,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Clear and remove @cgrp directory. The removal puts the base ref * but we aren't quite done with @cgrp yet, so hold onto it. */ - cgroup_clear_dir(cgrp, true, cgrp->root->subsys_mask); + cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); + cgroup_addrm_files(cgrp, NULL, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); -- cgit v1.3-8-gc7d7 From bee550994f6b0c1179bd3ccea58dc5c2c4ccf842 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 16:24:11 -0700 Subject: cgroup: update error handling in cgroup_populate_dir() cgroup_populate_dir() didn't use to check whether the actual file creations were successful and could return success with only subset of the requested files created, which is nasty. This patch udpates cgroup_populate_dir() so that it either succeeds with all files or fails with no file. v2: The original patch also converted for_each_root_subsys() usages to for_each_subsys() without explaining why. That part has been moved to a separate patch. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9835a097f3c0..6b7324431b99 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4171,10 +4171,13 @@ static struct cftype cgroup_base_files[] = { * cgroup_populate_dir - create subsys files in a cgroup directory * @cgrp: target cgroup * @subsys_mask: mask of the subsystem ids whose files should be added + * + * On failure, no file is added. */ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; + int ret = 0; /* process cftsets of each subsystem */ for_each_root_subsys(cgrp->root, ss) { @@ -4182,8 +4185,11 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) if (!test_bit(ss->subsys_id, &subsys_mask)) continue; - list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, ss, set->cfts, true); + list_for_each_entry(set, &ss->cftsets, node) { + ret = cgroup_addrm_files(cgrp, ss, set->cfts, true); + if (ret < 0) + goto err; + } } /* This cgroup is ready now */ @@ -4201,6 +4207,9 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) } return 0; +err: + cgroup_clear_dir(cgrp, subsys_mask); + return ret; } static void css_dput_fn(struct work_struct *work) -- cgit v1.3-8-gc7d7 From b420ba7db15659253d4f286a0ba479d336371999 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 12 Jul 2013 12:34:02 -0700 Subject: cgroup: use for_each_subsys() instead of for_each_root_subsys() in cgroup_populate/clear_dir() rebind_subsystems() will be updated to handle file creations and removals with proper error handling and to do that will need to perform file operations before actually adding the subsystem to the hierarchy. To enable such usage, update cgroup_populate/clear_dir() to use for_each_subsys() instead of for_each_root_subsys() so that they operate on all subsystems specified by @subsys_mask whether that subsystem is currently bound to the hierarchy or not. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6b7324431b99..8f70dc0c0c79 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -965,10 +965,12 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; + int i; - for_each_root_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) cgroup_addrm_files(cgrp, NULL, set->cfts, false); @@ -4177,12 +4179,13 @@ static struct cftype cgroup_base_files[] = { static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) { struct cgroup_subsys *ss; - int ret = 0; + int i, ret = 0; /* process cftsets of each subsystem */ - for_each_root_subsys(cgrp->root, ss) { + for_each_subsys(ss, i) { struct cftype_set *set; - if (!test_bit(ss->subsys_id, &subsys_mask)) + + if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) { -- cgit v1.3-8-gc7d7 From 3126121fb30941552b1a806c7c2e686bde57e270 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 17:07:30 -0700 Subject: cgroup: make rebind_subsystems() handle file additions and removals with proper error handling Currently, creating and removing cgroup files in the root directory are handled separately from the actual subsystem binding and unbinding which happens in rebind_subsystems(). Also, rebind_subsystems() users aren't handling file creation errors properly. Let's integrate top_cgroup file handling into rebind_subsystems() so that it's simpler to use and everyone handles file creation errors correctly. * On a successful return, rebind_subsystems() is guaranteed to have created all files of the new subsystems and deleted the ones belonging to the removed subsystems. After a failure, no file is created or removed. * cgroup_remount() no longer needs to make explicit populate/clear calls as it's all handled by rebind_subsystems(), and it gets proper error handling automatically. * cgroup_mount() has been updated such that the root dentry and cgroup are linked before rebind_subsystems(). Also, the init_cred dancing and base file handling are moved right above rebind_subsystems() call and proper error handling for the base files is added. While at it, add a comment explaining what's going on with the cred thing. * cgroup_kill_sb() calls rebind_subsystems() to unbind all subsystems which now implies removing all subsystem files which requires the directory's i_mutex. Grab it. This means that files on the root cgroup are removed earlier - they used to be deleted from generic super_block cleanup from vfs. This doesn't lead to any functional difference and it's cleaner to do the clean up explicitly for all files. Combined with the previous changes, this makes all cgroup file creation errors handled correctly. v2: Added comment on init_cred. v3: Li spotted that cgroup_mount() wasn't freeing tmp_links after base file addition failure. Fix it by adding free_tmp_links error handling label. v4: v3 introduced build bugs which got noticed by Fengguang's awesome kbuild test robot. Fixed, and shame on me. Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Fengguang Wu --- kernel/cgroup.c | 73 ++++++++++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8f70dc0c0c79..4ec8d2da94d1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1003,7 +1003,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; - int i; + int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); @@ -1028,7 +1028,16 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (root->number_of_cgroups > 1) return -EBUSY; - /* Process each subsystem */ + ret = cgroup_populate_dir(cgrp, added_mask); + if (ret) + return ret; + + /* + * Nothing can fail from this point on. Remove files for the + * removed subsystems and rebind each subsystem. + */ + cgroup_clear_dir(cgrp, removed_mask); + for_each_subsys(ss, i) { unsigned long bit = 1UL << i; @@ -1364,22 +1373,9 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } - /* - * Clear out the files of subsystems that should be removed, do - * this before rebind_subsystems, since rebind_subsystems may - * change this hierarchy's subsys_list. - */ - cgroup_clear_dir(cgrp, removed_mask); - ret = rebind_subsystems(root, added_mask, removed_mask); - if (ret) { - /* rebind_subsystems failed, re-populate the removed files */ - cgroup_populate_dir(cgrp, removed_mask); + if (ret) goto out_unlock; - } - - /* re-populate subsystem files */ - cgroup_populate_dir(cgrp, added_mask); if (opts.release_agent) strcpy(root->release_agent_path, opts.release_agent); @@ -1578,7 +1574,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int ret = 0; struct super_block *sb; struct cgroupfs_root *new_root; + struct list_head tmp_links; struct inode *inode; + const struct cred *cred; /* First find the desired set of subsystems */ mutex_lock(&cgroup_mutex); @@ -1610,10 +1608,8 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!root); if (root == opts.new_root) { /* We used the new root structure, so this is a new hierarchy */ - struct list_head tmp_links; struct cgroup *root_cgrp = &root->top_cgroup; struct cgroupfs_root *existing_root; - const struct cred *cred; int i; struct css_set *cset; @@ -1651,26 +1647,37 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto unlock_drop; + sb->s_root->d_fsdata = root_cgrp; + root_cgrp->dentry = sb->s_root; + + /* + * We're inside get_sb() and will call lookup_one_len() to + * create the root files, which doesn't work if SELinux is + * in use. The following cred dancing somehow works around + * it. See 2ce9738ba ("cgroupfs: use init_cred when + * populating new cgroupfs mount") for more details. + */ + cred = override_creds(&init_cred); + + ret = cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + if (ret) + goto rm_base_files; + ret = rebind_subsystems(root, root->subsys_mask, 0); - if (ret == -EBUSY) { - free_cgrp_cset_links(&tmp_links); - goto unlock_drop; - } + if (ret) + goto rm_base_files; + + revert_creds(cred); + /* * There must be no failure case after here, since rebinding * takes care of subsystems' refcounts, which are explicitly * dropped in the failure exit path. */ - /* EBUSY should be the only error here */ - BUG_ON(ret); - list_add(&root->root_list, &cgroup_roots); cgroup_root_count++; - sb->s_root->d_fsdata = root_cgrp; - root->top_cgroup.dentry = sb->s_root; - /* Link the top cgroup in this hierarchy into all * the css_set objects */ write_lock(&css_set_lock); @@ -1683,10 +1690,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, BUG_ON(!list_empty(&root_cgrp->children)); BUG_ON(root->number_of_cgroups != 1); - cred = override_creds(&init_cred); - cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); - cgroup_populate_dir(root_cgrp, root->subsys_mask); - revert_creds(cred); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -1715,6 +1718,10 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, kfree(opts.name); return dget(sb->s_root); + rm_base_files: + free_cgrp_cset_links(&tmp_links); + cgroup_addrm_files(&root->top_cgroup, NULL, cgroup_base_files, false); + revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); mutex_unlock(&cgroup_root_mutex); @@ -1741,6 +1748,7 @@ static void cgroup_kill_sb(struct super_block *sb) { BUG_ON(root->number_of_cgroups != 1); BUG_ON(!list_empty(&cgrp->children)); + mutex_lock(&cgrp->dentry->d_inode->i_mutex); mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); @@ -1773,6 +1781,7 @@ static void cgroup_kill_sb(struct super_block *sb) { mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); + mutex_unlock(&cgrp->dentry->d_inode->i_mutex); simple_xattrs_free(&cgrp->xattrs); -- cgit v1.3-8-gc7d7 From f172e67cf9d842bc646d0f66792e38435a334b1e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 17:07:30 -0700 Subject: cgroup: move number_of_cgroups test out of rebind_subsystems() into cgroup_remount() rebind_subsystems() currently fails if the hierarchy has any !root cgroups; however, on the planned unified hierarchy, rebind_subsystems() will be used while populated. Move the test to cgroup_remount(), which is the only place the test is necessary anyway. As it's impossible for the other two callers of rebind_subsystems() to have populated hierarchy, this doesn't make any behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4ec8d2da94d1..c108d3d1ea30 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1021,13 +1021,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } } - /* Currently we don't handle adding/removing subsystems when - * any child cgroups exist. This is theoretically supportable - * but involves complex error handling, so it's being left until - * later */ - if (root->number_of_cgroups > 1) - return -EBUSY; - ret = cgroup_populate_dir(cgrp, added_mask); if (ret) return ret; @@ -1373,6 +1366,12 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) goto out_unlock; } + /* remounting is not allowed for populated hierarchies */ + if (root->number_of_cgroups > 1) { + ret = -EBUSY; + goto out_unlock; + } + ret = rebind_subsystems(root, added_mask, removed_mask); if (ret) goto out_unlock; -- cgit v1.3-8-gc7d7 From 1d5be6b287c8efc879fbe578e2b7bc8f7a38f313 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 12 Jul 2013 13:38:17 -0700 Subject: cgroup: move module ref handling into rebind_subsystems() Module ref handling in cgroup is rather weird. parse_cgroupfs_options() grabs all the modules for the specified subsystems. A module ref is kept if the specified subsystem is newly bound to the hierarchy. If not, or the operation fails, the refs are dropped. This scatters module ref handling across multiple functions making it difficult to track. It also make the function nasty to use for dynamic subsystem binding which is necessary for the planned unified hierarchy. There's nothing which requires the subsystem modules to be pinned between parse_cgroupfs_options() and rebind_subsystems() in both mount and remount paths. parse_cgroupfs_options() can just parse and rebind_subsystems() can handle pinning the subsystems that it wants to bind, which is a natural part of its task - binding - anyway. Move module ref handling into rebind_subsystems() which makes the code a lot simpler - modules are gotten iff it's gonna be bound and put iff unbound or binding fails. v2: Li pointed out that if a controller module is unloaded between parsing and binding, rebind_subsystems() won't notice the missing controller as it only iterates through existing controllers. Fix it by updating rebind_subsystems() to compare @added_mask to @pinned and fail with -ENOENT if they don't match. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 93 +++++++++++++++++---------------------------------------- 1 file changed, 28 insertions(+), 65 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c108d3d1ea30..2a8cf1a7d2f4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1003,6 +1003,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, { struct cgroup *cgrp = &root->top_cgroup; struct cgroup_subsys *ss; + unsigned long pinned = 0; int i, ret; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -1010,20 +1011,32 @@ static int rebind_subsystems(struct cgroupfs_root *root, /* Check that any added subsystems are currently free */ for_each_subsys(ss, i) { - unsigned long bit = 1UL << i; - - if (!(bit & added_mask)) + if (!(added_mask & (1 << i))) continue; + /* is the subsystem mounted elsewhere? */ if (ss->root != &cgroup_dummy_root) { - /* Subsystem isn't free */ - return -EBUSY; + ret = -EBUSY; + goto out_put; + } + + /* pin the module */ + if (!try_module_get(ss->module)) { + ret = -ENOENT; + goto out_put; } + pinned |= 1 << i; + } + + /* subsys could be missing if unloaded between parsing and here */ + if (added_mask != pinned) { + ret = -ENOENT; + goto out_put; } ret = cgroup_populate_dir(cgrp, added_mask); if (ret) - return ret; + goto out_put; /* * Nothing can fail from this point on. Remove files for the @@ -1067,11 +1080,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, } else if (bit & root->subsys_mask) { /* Subsystem state should already exist */ BUG_ON(!cgrp->subsys[i]); - /* - * a refcount was taken, but we already had one, so - * drop the extra reference. - */ - module_put(ss->module); #ifdef CONFIG_MODULE_UNLOAD BUG_ON(ss->module && !module_refcount(ss->module)); #endif @@ -1088,6 +1096,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, root->flags |= CGRP_ROOT_SUBSYS_BOUND; return 0; + +out_put: + for_each_subsys(ss, i) + if (pinned & (1 << i)) + module_put(ss->module); + return ret; } static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) @@ -1138,7 +1152,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) char *token, *o = data; bool all_ss = false, one_ss = false; unsigned long mask = (unsigned long)-1; - bool module_pin_failed = false; struct cgroup_subsys *ss; int i; @@ -1281,52 +1294,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) if (!opts->subsys_mask && !opts->name) return -EINVAL; - /* - * Grab references on all the modules we'll need, so the subsystems - * don't dance around before rebind_subsystems attaches them. This may - * take duplicate reference counts on a subsystem that's already used, - * but rebind_subsystems handles this case. - */ - for_each_subsys(ss, i) { - if (!(opts->subsys_mask & (1UL << i))) - continue; - if (!try_module_get(cgroup_subsys[i]->module)) { - module_pin_failed = true; - break; - } - } - if (module_pin_failed) { - /* - * oops, one of the modules was going away. this means that we - * raced with a module_delete call, and to the user this is - * essentially a "subsystem doesn't exist" case. - */ - for (i--; i >= 0; i--) { - /* drop refcounts only on the ones we took */ - unsigned long bit = 1UL << i; - - if (!(bit & opts->subsys_mask)) - continue; - module_put(cgroup_subsys[i]->module); - } - return -ENOENT; - } - return 0; } -static void drop_parsed_module_refcounts(unsigned long subsys_mask) -{ - struct cgroup_subsys *ss; - int i; - - mutex_lock(&cgroup_mutex); - for_each_subsys(ss, i) - if (subsys_mask & (1UL << i)) - module_put(cgroup_subsys[i]->module); - mutex_unlock(&cgroup_mutex); -} - static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1384,8 +1354,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); mutex_unlock(&cgrp->dentry->d_inode->i_mutex); - if (ret) - drop_parsed_module_refcounts(opts.subsys_mask); return ret; } @@ -1591,7 +1559,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto drop_modules; + goto out_err; } opts.new_root = new_root; @@ -1600,7 +1568,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_free_root(opts.new_root); - goto drop_modules; + goto out_err; } root = sb->s_fs_info; @@ -1708,9 +1676,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, pr_warning("cgroup: new mount options do not match the existing superblock, will be ignored\n"); } } - - /* no subsys rebinding, so refcounts don't change */ - drop_parsed_module_refcounts(opts.subsys_mask); } kfree(opts.release_agent); @@ -1728,8 +1693,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_unlock(&inode->i_mutex); drop_new_super: deactivate_locked_super(sb); - drop_modules: - drop_parsed_module_refcounts(opts.subsys_mask); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -4837,7 +4800,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) /* * we shouldn't be called if the subsystem is in use, and the use of - * try_module_get in parse_cgroupfs_options should ensure that it + * try_module_get() in rebind_subsystems() should ensure that it * doesn't start being used while we're killing it off. */ BUG_ON(ss->root != &cgroup_dummy_root); -- cgit v1.3-8-gc7d7 From a698b4488ab98deef6c3beeba3e27fea17650132 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Jun 2013 21:08:27 -0700 Subject: cgroup: remove gratuituous BUG_ON()s from rebind_subsystems() rebind_subsystems() performs santiy checks even on subsystems which aren't specified to be added or removed and the checks aren't all that useful given that these are in a very cold path while the violations they check would trip up in much hotter paths. Let's remove these from rebind_subsystems(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2a8cf1a7d2f4..345fac8e4fba 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1077,15 +1077,6 @@ static int rebind_subsystems(struct cgroupfs_root *root, /* subsystem is now free - drop reference on module */ module_put(ss->module); root->subsys_mask &= ~bit; - } else if (bit & root->subsys_mask) { - /* Subsystem state should already exist */ - BUG_ON(!cgrp->subsys[i]); -#ifdef CONFIG_MODULE_UNLOAD - BUG_ON(ss->module && !module_refcount(ss->module)); -#endif - } else { - /* Subsystem state shouldn't exist */ - BUG_ON(cgrp->subsys[i]); } } -- cgit v1.3-8-gc7d7 From fd4363fff3d96795d3feb1b3fb48ce590f186bdd Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 12 Jul 2013 11:21:48 +0200 Subject: x86: Introduce int3 (breakpoint)-based instruction patching Introduce a method for run-time instruction patching on a live SMP kernel based on int3 breakpoint, completely avoiding the need for stop_machine(). The way this is achieved: - add a int3 trap to the address that will be patched - sync cores - update all but the first byte of the patched range - sync cores - replace the first byte (int3) by the first byte of replacing opcode - sync cores According to http://lkml.indiana.edu/hypermail/linux/kernel/1001.1/01530.html synchronization after replacing "all but first" instructions should not be necessary (on Intel hardware), as the syncing after the subsequent patching of the first byte provides enough safety. But there's not only Intel HW out there, and we'd rather be on a safe side. If any CPU instruction execution would collide with the patching, it'd be trapped by the int3 breakpoint and redirected to the provided "handler" (which would typically mean just skipping over the patched region, acting as "nop" has been there, in case we are doing nop -> jump and jump -> nop transitions). Ftrace has been using this very technique since 08d636b ("ftrace/x86: Have arch x86_64 use breakpoints instead of stop machine") for ages already, and jump labels are another obvious potential user of this. Based on activities of Masami Hiramatsu a few years ago. Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Signed-off-by: Jiri Kosina Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1307121102440.29788@pobox.suse.cz Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/alternative.h | 1 + arch/x86/kernel/alternative.c | 106 +++++++++++++++++++++++++++++++++++++ kernel/kprobes.c | 2 +- 3 files changed, 108 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 58ed6d96a6ac..3abf8dd6f44f 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -233,6 +233,7 @@ struct text_poke_param { }; extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); extern void *text_poke_smp(void *addr, const void *opcode, size_t len); extern void text_poke_smp_batch(struct text_poke_param *params, int n); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c15cf9a25e27..0ab49366a7a6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -596,6 +597,111 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len) return addr; } +static void do_sync_core(void *info) +{ + sync_core(); +} + +static bool bp_patching_in_progress; +static void *bp_int3_handler, *bp_int3_addr; + +static int int3_notify(struct notifier_block *self, unsigned long val, void *data) +{ + struct die_args *args = data; + + /* bp_patching_in_progress */ + smp_rmb(); + + if (likely(!bp_patching_in_progress)) + return NOTIFY_DONE; + + /* we are not interested in non-int3 faults and ring > 0 faults */ + if (val != DIE_INT3 || !args->regs || user_mode_vm(args->regs) + || args->regs->ip != (unsigned long)bp_int3_addr) + return NOTIFY_DONE; + + /* set up the specified breakpoint handler */ + args->regs->ip = (unsigned long) bp_int3_handler; + + return NOTIFY_STOP; +} +/** + * text_poke_bp() -- update instructions on live kernel on SMP + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @handler: address to jump to when the temporary breakpoint is hit + * + * Modify multi-byte instruction by using int3 breakpoint on SMP. + * In contrary to text_poke_smp(), we completely avoid stop_machine() here, + * and achieve the synchronization using int3 breakpoint. + * + * The way it is done: + * - add a int3 trap to the address that will be patched + * - sync cores + * - update all but the first byte of the patched range + * - sync cores + * - replace the first byte (int3) by the first byte of + * replacing opcode + * - sync cores + * + * Note: must be called under text_mutex. + */ +void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) +{ + unsigned char int3 = 0xcc; + + bp_int3_handler = handler; + bp_int3_addr = (u8 *)addr + sizeof(int3); + bp_patching_in_progress = true; + /* + * Corresponding read barrier in int3 notifier for + * making sure the in_progress flags is correctly ordered wrt. + * patching + */ + smp_wmb(); + + text_poke(addr, &int3, sizeof(int3)); + + on_each_cpu(do_sync_core, NULL, 1); + + if (len - sizeof(int3) > 0) { + /* patch all but the first byte */ + text_poke((char *)addr + sizeof(int3), + (const char *) opcode + sizeof(int3), + len - sizeof(int3)); + /* + * According to Intel, this core syncing is very likely + * not necessary and we'd be safe even without it. But + * better safe than sorry (plus there's not only Intel). + */ + on_each_cpu(do_sync_core, NULL, 1); + } + + /* patch the first byte */ + text_poke(addr, opcode, sizeof(int3)); + + on_each_cpu(do_sync_core, NULL, 1); + + bp_patching_in_progress = false; + smp_wmb(); + + return addr; +} + +/* this one needs to run before anything else handles it as a + * regular exception */ +static struct notifier_block int3_nb = { + .priority = 0x7fffffff, + .notifier_call = int3_notify +}; + +static int __init int3_init(void) +{ + return register_die_notifier(&int3_nb); +} + +arch_initcall(int3_init); /* * Cross-modifying kernel text with stop_machine(). * This code originally comes from immediate value. diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6e33498d665c..b58b490fa439 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1709,7 +1709,7 @@ EXPORT_SYMBOL_GPL(unregister_kprobes); static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, - .priority = 0x7fffffff /* we need to be notified first */ + .priority = 0x7ffffff0 /* High priority, but not first. */ }; unsigned long __weak arch_deref_entry_point(void *entry) -- cgit v1.3-8-gc7d7 From e04c5d76b0cfb66cadd900cf147526f2271884b8 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Jul 2013 22:21:57 -0300 Subject: remove sched notifier for cross-cpu migrations Linux as a guest on KVM hypervisor, the only user of the pvclock vsyscall interface, does not require notification on task migration because: 1. cpu ID number maps 1:1 to per-CPU pvclock time info. 2. per-CPU pvclock time info is updated if the underlying CPU changes. 3. that version is increased whenever underlying CPU changes. Which is sufficient to guarantee nanoseconds counter is calculated properly. Signed-off-by: Marcelo Tosatti Acked-by: Peter Zijlstra Signed-off-by: Gleb Natapov --- arch/x86/include/asm/pvclock.h | 1 - arch/x86/kernel/pvclock.c | 44 ------------------------------------------ arch/x86/vdso/vclock_gettime.c | 16 +++++++-------- include/linux/sched.h | 8 -------- kernel/sched/core.c | 15 -------------- 5 files changed, 8 insertions(+), 76 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 109a9dd5d454..be8269b00e2a 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -93,7 +93,6 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; - u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2cb9470ea85b..a16bae3f83b3 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -128,46 +128,7 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } -static struct pvclock_vsyscall_time_info *pvclock_vdso_info; - -static struct pvclock_vsyscall_time_info * -pvclock_get_vsyscall_user_time_info(int cpu) -{ - if (!pvclock_vdso_info) { - BUG(); - return NULL; - } - - return &pvclock_vdso_info[cpu]; -} - -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) -{ - return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; -} - #ifdef CONFIG_X86_64 -static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, - void *v) -{ - struct task_migration_notifier *mn = v; - struct pvclock_vsyscall_time_info *pvti; - - pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); - - /* this is NULL when pvclock vsyscall is not initialized */ - if (unlikely(pvti == NULL)) - return NOTIFY_DONE; - - pvti->migrate_count++; - - return NOTIFY_DONE; -} - -static struct notifier_block pvclock_migrate = { - .notifier_call = pvclock_task_migrate, -}; - /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -181,17 +142,12 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - pvclock_vdso_info = i; - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } - - register_task_migration_notifier(&pvclock_migrate); - return 0; } #endif diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index c74436e687bf..72074d528400 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -85,15 +85,18 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; - u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * When looping to get a consistent (time-info, tsc) pair, we - * also need to deal with the possibility we can switch vcpus, - * so make sure we always re-fetch time-info for the current vcpu. + * Note: hypervisor must guarantee that: + * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. + * 2. that per-CPU pvclock time info is updated if the + * underlying CPU changes. + * 3. that version is increased whenever underlying CPU + * changes. + * */ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -104,8 +107,6 @@ static notrace cycle_t vread_pvclock(int *mode) pvti = get_pvti(cpu); - migrate_count = pvti->migrate_count; - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* @@ -117,8 +118,7 @@ static notrace cycle_t vread_pvclock(int *mode) cpu1 = __getcpu() & VGETCPU_CPU_MASK; } while (unlikely(cpu != cpu1 || (pvti->pvti.version & 1) || - pvti->pvti.version != version || - pvti->migrate_count != migrate_count)); + pvti->pvti.version != version)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/include/linux/sched.h b/include/linux/sched.h index 50d04b92ceda..bfc809d51745 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -107,14 +107,6 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(unsigned long ticks); extern void update_cpu_load_nohz(void); -/* Notifier for when a task gets migrated to a new CPU */ -struct task_migration_notifier { - struct task_struct *task; - int from_cpu; - int to_cpu; -}; -extern void register_task_migration_notifier(struct notifier_block *n); - extern unsigned long get_parent_ip(unsigned long addr); extern void dump_cpu_task(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0d8eb4525e76..0efd2eefb027 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -976,13 +976,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq->skip_clock_update = 1; } -static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); - -void register_task_migration_notifier(struct notifier_block *n) -{ - atomic_notifier_chain_register(&task_migration_notifier, n); -} - #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -1013,18 +1006,10 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { - struct task_migration_notifier tmn; - if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0); - - tmn.task = p; - tmn.from_cpu = task_cpu(p); - tmn.to_cpu = new_cpu; - - atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); -- cgit v1.3-8-gc7d7 From 87e3c8ae1c8676b9dd56b56456dafa14a4bacf97 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Sun, 21 Jul 2013 04:32:07 +0400 Subject: sched/fair: Cleanup: remove duplicate variable declaration cfs_rq is declared twice, fix it. Also use 'se' instead of '&p->se'. Signed-off-by: Kirill Tkhai Cc: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/169201374366727@web6d.yandex.ru Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb456f44b7b1..ab599781129d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5889,11 +5889,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) * and ensure we don't carry in an old decay_count if we * switch back. */ - if (p->se.avg.decay_count) { - struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); - __synchronize_entity_decay(&p->se); - subtract_blocked_load_contrib(cfs_rq, - p->se.avg.load_avg_contrib); + if (se->avg.decay_count) { + __synchronize_entity_decay(se); + subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib); } #endif } -- cgit v1.3-8-gc7d7 From 1e40c2edef2537f87f94d0baf80aeaeb7d51cc23 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 19 Jul 2013 20:31:01 +0200 Subject: mutex: Fix/document access-once assumption in mutex_can_spin_on_owner() mutex_can_spin_on_owner() is technically broken in that it would in theory allow the compiler to load lock->owner twice, seeing a pointer first time and a NULL pointer the second time. Linus pointed out that a compiler has to be seriously broken to not compile this correctly - but nevertheless this change is correct as it will better document the implementation. Signed-off-by: Peter Zijlstra Acked-by: Davidlohr Bueso Acked-by: Waiman Long Acked-by: Linus Torvalds Acked-by: Thomas Gleixner Acked-by: Rik van Riel Cc: Paul E. McKenney Cc: David Howells Link: http://lkml.kernel.org/r/20130719183101.GA20909@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/mutex.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index ff05f4bd86eb..7ff48c55a98b 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -209,11 +209,13 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) */ static inline int mutex_can_spin_on_owner(struct mutex *lock) { + struct task_struct *owner; int retval = 1; rcu_read_lock(); - if (lock->owner) - retval = lock->owner->on_cpu; + owner = ACCESS_ONCE(lock->owner); + if (owner) + retval = owner->on_cpu; rcu_read_unlock(); /* * if lock->owner is not set, the mutex owner may have just acquired -- cgit v1.3-8-gc7d7 From 17f41571bb2c4a398785452ac2718a6c5d77180e Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 23 Jul 2013 10:09:28 +0200 Subject: kprobes/x86: Call out into INT3 handler directly instead of using notifier In fd4363fff3d96 ("x86: Introduce int3 (breakpoint)-based instruction patching"), the mechanism that was introduced for notifying alternatives code from int3 exception handler that and exception occured was die_notifier. This is however problematic, as early code might be using jump labels even before the notifier registration has been performed, which will then lead to an oops due to unhandled exception. One of such occurences has been encountered by Fengguang: int3: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC Modules linked in: CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.11.0-rc1-01429-g04bf576 #8 task: ffff88000da1b040 ti: ffff88000da1c000 task.ti: ffff88000da1c000 RIP: 0010:[] [] ttwu_do_wakeup+0x28/0x225 RSP: 0000:ffff88000dd03f10 EFLAGS: 00000006 RAX: 0000000000000000 RBX: ffff88000dd12940 RCX: ffffffff81769c40 RDX: 0000000000000002 RSI: 0000000000000000 RDI: 0000000000000001 RBP: ffff88000dd03f28 R08: ffffffff8176a8c0 R09: 0000000000000002 R10: ffffffff810ff484 R11: ffff88000dd129e8 R12: ffff88000dbc90c0 R13: ffff88000dbc90c0 R14: ffff88000da1dfd8 R15: ffff88000da1dfd8 FS: 0000000000000000(0000) GS:ffff88000dd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00000000ffffffff CR3: 0000000001c88000 CR4: 00000000000006e0 Stack: ffff88000dd12940 ffff88000dbc90c0 ffff88000da1dfd8 ffff88000dd03f48 ffffffff81109e2b ffff88000dd12940 0000000000000000 ffff88000dd03f68 ffffffff81109e9e 0000000000000000 0000000000012940 ffff88000dd03f98 Call Trace: [] ttwu_do_activate.constprop.56+0x6d/0x79 [] sched_ttwu_pending+0x67/0x84 [] scheduler_ipi+0x15a/0x2b0 [] smp_reschedule_interrupt+0x38/0x41 [] reschedule_interrupt+0x6d/0x80 [] ? __atomic_notifier_call_chain+0x5/0xc1 [] ? native_safe_halt+0xd/0x16 [] default_idle+0x147/0x282 [] arch_cpu_idle+0x3d/0x5d [] cpu_idle_loop+0x46d/0x5db [] cpu_startup_entry+0x84/0x84 [] start_secondary+0x3c8/0x3d5 [...] Fix this by directly calling poke_int3_handler() from the int3 exception handler (analogically to what ftrace has been doing already), instead of relying on notifier, registration of which might not have yet been finalized by the time of the first trap. Reported-and-tested-by: Fengguang Wu Signed-off-by: Jiri Kosina Acked-by: Masami Hiramatsu Cc: H. Peter Anvin Cc: Fengguang Wu Cc: Steven Rostedt Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1307231007490.14024@pobox.suse.cz Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 2 ++ arch/x86/kernel/alternative.c | 31 ++++++++----------------------- arch/x86/kernel/traps.c | 4 ++++ kernel/kprobes.c | 2 +- 4 files changed, 15 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4daf8c501239..0a3f9c9f98d5 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -5,6 +5,7 @@ #include #include #include +#include /* * Alternative inline assembly for SMP. @@ -224,6 +225,7 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); * inconsistent instruction while you patch. */ extern void *text_poke(void *addr, const void *opcode, size_t len); +extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); #endif /* _ASM_X86_ALTERNATIVE_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5d8782ee3dd7..15e8563e5c24 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -605,26 +605,24 @@ static void do_sync_core(void *info) static bool bp_patching_in_progress; static void *bp_int3_handler, *bp_int3_addr; -static int int3_notify(struct notifier_block *self, unsigned long val, void *data) +int poke_int3_handler(struct pt_regs *regs) { - struct die_args *args = data; - /* bp_patching_in_progress */ smp_rmb(); if (likely(!bp_patching_in_progress)) - return NOTIFY_DONE; + return 0; - /* we are not interested in non-int3 faults and ring > 0 faults */ - if (val != DIE_INT3 || !args->regs || user_mode_vm(args->regs) - || args->regs->ip != (unsigned long)bp_int3_addr) - return NOTIFY_DONE; + if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) + return 0; /* set up the specified breakpoint handler */ - args->regs->ip = (unsigned long) bp_int3_handler; + regs->ip = (unsigned long) bp_int3_handler; + + return 1; - return NOTIFY_STOP; } + /** * text_poke_bp() -- update instructions on live kernel on SMP * @addr: address to patch @@ -689,16 +687,3 @@ void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler) return addr; } -/* this one needs to run before anything else handles it as a - * regular exception */ -static struct notifier_block int3_nb = { - .priority = 0x7fffffff, - .notifier_call = int3_notify -}; - -static int __init int3_init(void) -{ - return register_die_notifier(&int3_nb); -} - -arch_initcall(int3_init); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1b23a1c92746..8c8093b146ca 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -58,6 +58,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include @@ -327,6 +328,9 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co ftrace_int3_handler(regs)) return; #endif + if (poke_int3_handler(regs)) + return; + prev_state = exception_enter(); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index b58b490fa439..6e33498d665c 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1709,7 +1709,7 @@ EXPORT_SYMBOL_GPL(unregister_kprobes); static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, - .priority = 0x7ffffff0 /* High priority, but not first. */ + .priority = 0x7fffffff /* we need to be notified first */ }; unsigned long __weak arch_deref_entry_point(void *entry) -- cgit v1.3-8-gc7d7 From ec83f425dbca47e19c6737e8e7db0d0924a5de1b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 28 Jun 2013 13:13:18 -0700 Subject: mutex: Do not unnecessarily deal with waiters Upon entering the slowpath, we immediately attempt to acquire the lock by checking if it is already unlocked. If we are lucky enough that this is the case, then we don't need to deal with any waiter related logic. Furthermore any checks for an empty wait_list are unnecessary as we already know that count is non-negative and hence no one is waiting for the lock. Move the count check and xchg calls to be done before any waiters are setup - including waiter debugging. Upon failure to acquire the lock, the xchg sets the counter to 0, instead of -1 as it was originally. This can be done here since we set it back to -1 right at the beginning of the loop so other waiters are woken up when the lock is released. When tested on a 8-socket (80 core) system against a vanilla 3.10-rc1 kernel, this patch provides some small performance benefits (+2-6%). While it could be considered in the noise level, the average percentages were stable across multiple runs and no performance regressions were seen. Two big winners, for small amounts of users (10-100), were the short and compute workloads had a +19.36% and +%15.76% in jobs per minute. Also change some break statements to 'goto slowpath', which IMO makes a little more intuitive to read. Signed-off-by: Davidlohr Bueso Acked-by: Rik van Riel Acked-by: Maarten Lankhorst Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1372450398.2106.1.camel@buesod1.americas.hpqcorp.net Signed-off-by: Ingo Molnar --- kernel/mutex.c | 41 ++++++++++++++++++----------------------- 1 file changed, 18 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 7ff48c55a98b..386ad5da47a5 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -463,7 +463,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * performed the optimistic spinning cannot be done. */ if (ACCESS_ONCE(ww->ctx)) - break; + goto slowpath; } /* @@ -474,7 +474,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, owner = ACCESS_ONCE(lock->owner); if (owner && !mutex_spin_on_owner(lock, owner)) { mspin_unlock(MLOCK(lock), &node); - break; + goto slowpath; } if ((atomic_read(&lock->count) == 1) && @@ -489,8 +489,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_set_owner(lock); mspin_unlock(MLOCK(lock), &node); - preempt_enable(); - return 0; + goto done; } mspin_unlock(MLOCK(lock), &node); @@ -501,7 +500,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * the owner complete. */ if (!owner && (need_resched() || rt_task(task))) - break; + goto slowpath; /* * The cpu_relax() call is a compiler barrier which forces @@ -515,6 +514,10 @@ slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); + /* once more, can we acquire the lock? */ + if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, 0) == 1)) + goto skip_wait; + debug_mutex_lock_common(lock, &waiter); debug_mutex_add_waiter(lock, &waiter, task_thread_info(task)); @@ -522,9 +525,6 @@ slowpath: list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) - goto done; - lock_contended(&lock->dep_map, ip); for (;;) { @@ -538,7 +538,7 @@ slowpath: * other waiters: */ if (MUTEX_SHOW_NO_WAITER(lock) && - (atomic_xchg(&lock->count, -1) == 1)) + (atomic_xchg(&lock->count, -1) == 1)) break; /* @@ -563,24 +563,25 @@ slowpath: schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); } + mutex_remove_waiter(lock, &waiter, current_thread_info()); + /* set it to 0 if there are no waiters left: */ + if (likely(list_empty(&lock->wait_list))) + atomic_set(&lock->count, 0); + debug_mutex_free_waiter(&waiter); -done: +skip_wait: + /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - /* got the lock - rejoice! */ - mutex_remove_waiter(lock, &waiter, current_thread_info()); mutex_set_owner(lock); if (!__builtin_constant_p(ww_ctx == NULL)) { - struct ww_mutex *ww = container_of(lock, - struct ww_mutex, - base); + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct mutex_waiter *cur; /* * This branch gets optimized out for the common case, * and is only important for ww_mutex_lock. */ - ww_mutex_lock_acquired(ww, ww_ctx); ww->ctx = ww_ctx; @@ -594,15 +595,9 @@ done: } } - /* set it to 0 if there are no waiters left: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - spin_unlock_mutex(&lock->wait_lock, flags); - - debug_mutex_free_waiter(&waiter); +done: preempt_enable(); - return 0; err: -- cgit v1.3-8-gc7d7 From a5cdd40c9877e9aba704c020fd65d26b5cfecf18 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 16 Jul 2013 17:09:07 +0200 Subject: perf: Update perf_event_type documentation Due to a discussion with Adrian I had a good look at the perf_event_type record layout and found the documentation to be somewhat unclear. Cc: Adrian Hunter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130716150907.GL23818@dyad.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 18 +++++++++++++++++- kernel/events/core.c | 31 ++++++++++++++++--------------- 2 files changed, 33 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 0b1df41691e8..00d8274730b4 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -478,6 +478,16 @@ enum perf_event_type { * file will be supported by older perf tools, with these new optional * fields being ignored. * + * struct sample_id { + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * } && perf_event_attr::sample_id_all + */ + + /* * The MMAP events record the PROT_EXEC mappings so that we can * correlate userspace IPs to code. They have the following structure: * @@ -498,6 +508,7 @@ enum perf_event_type { * struct perf_event_header header; * u64 id; * u64 lost; + * struct sample_id sample_id; * }; */ PERF_RECORD_LOST = 2, @@ -508,6 +519,7 @@ enum perf_event_type { * * u32 pid, tid; * char comm[]; + * struct sample_id sample_id; * }; */ PERF_RECORD_COMM = 3, @@ -518,6 +530,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; + * struct sample_id sample_id; * }; */ PERF_RECORD_EXIT = 4, @@ -528,6 +541,7 @@ enum perf_event_type { * u64 time; * u64 id; * u64 stream_id; + * struct sample_id sample_id; * }; */ PERF_RECORD_THROTTLE = 5, @@ -539,6 +553,7 @@ enum perf_event_type { * u32 pid, ppid; * u32 tid, ptid; * u64 time; + * struct sample_id sample_id; * }; */ PERF_RECORD_FORK = 7, @@ -549,6 +564,7 @@ enum perf_event_type { * u32 pid, tid; * * struct read_format values; + * struct sample_id sample_id; * }; */ PERF_RECORD_READ = 8, @@ -596,7 +612,7 @@ enum perf_event_type { * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * * { u64 weight; } && PERF_SAMPLE_WEIGHT - * { u64 data_src; } && PERF_SAMPLE_DATA_SRC + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 5e2bce90b477..127411400116 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4462,20 +4462,6 @@ void perf_output_sample(struct perf_output_handle *handle, } } - if (!event->attr.watermark) { - int wakeup_events = event->attr.wakeup_events; - - if (wakeup_events) { - struct ring_buffer *rb = handle->rb; - int events = local_inc_return(&rb->events); - - if (events >= wakeup_events) { - local_sub(wakeup_events, &rb->events); - local_inc(&rb->wakeup); - } - } - } - if (sample_type & PERF_SAMPLE_BRANCH_STACK) { if (data->br_stack) { size_t size; @@ -4511,16 +4497,31 @@ void perf_output_sample(struct perf_output_handle *handle, } } - if (sample_type & PERF_SAMPLE_STACK_USER) + if (sample_type & PERF_SAMPLE_STACK_USER) { perf_output_sample_ustack(handle, data->stack_user_size, data->regs_user.regs); + } if (sample_type & PERF_SAMPLE_WEIGHT) perf_output_put(handle, data->weight); if (sample_type & PERF_SAMPLE_DATA_SRC) perf_output_put(handle, data->data_src.val); + + if (!event->attr.watermark) { + int wakeup_events = event->attr.wakeup_events; + + if (wakeup_events) { + struct ring_buffer *rb = handle->rb; + int events = local_inc_return(&rb->events); + + if (events >= wakeup_events) { + local_sub(wakeup_events, &rb->events); + local_inc(&rb->wakeup); + } + } + } } void perf_prepare_sample(struct perf_event_header *header, -- cgit v1.3-8-gc7d7 From 685207963be973fbb73550db6edaf920a283e1a7 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 15 Jul 2013 17:49:19 +0400 Subject: sched: Move h_load calculation to task_h_load() The bad thing about update_h_load(), which computes hierarchical load factor for task groups, is that it is called for each task group in the system before every load balancer run, and since rebalance can be triggered very often, this function can eat really a lot of cpu time if there are many cpu cgroups in the system. Although the situation was improved significantly by commit a35b646 ('sched, cgroup: Reduce rq->lock hold times for large cgroup hierarchies'), the problem still can arise under some kinds of loads, e.g. when cpus are switching from idle to busy and back very frequently. For instance, when I start 1000 of processes that wake up every millisecond on my 8 cpus host, 'top' and 'perf top' show: Cpu(s): 17.8%us, 24.3%sy, 0.0%ni, 57.9%id, 0.0%wa, 0.0%hi, 0.0%si Events: 243K cycles 7.57% [kernel] [k] __schedule 7.08% [kernel] [k] timerqueue_add 6.13% libc-2.12.so [.] usleep Then if I create 10000 *idle* cpu cgroups (no processes in them), cpu usage increases significantly although the 'wakers' are still executing in the root cpu cgroup: Cpu(s): 19.1%us, 48.7%sy, 0.0%ni, 31.6%id, 0.0%wa, 0.0%hi, 0.7%si Events: 230K cycles 24.56% [kernel] [k] tg_load_down 5.76% [kernel] [k] __schedule This happens because this particular kind of load triggers 'new idle' rebalance very frequently, which requires calling update_h_load(), which, in turn, calls tg_load_down() for every *idle* cpu cgroup even though it is absolutely useless, because idle cpu cgroups have no tasks to pull. This patch tries to improve the situation by making h_load calculation proceed only when h_load is really necessary. To achieve this, it substitutes update_h_load() with update_cfs_rq_h_load(), which computes h_load only for a given cfs_rq and all its ascendants, and makes the load balancer call this function whenever it considers if a task should be pulled, i.e. it moves h_load calculations directly to task_h_load(). For h_load of the same cfs_rq not to be updated multiple times (in case several tasks in the same cgroup are considered during the same balance run), the patch keeps the time of the last h_load update for each cfs_rq and breaks calculation when it finds h_load to be uptodate. The benefit of it is that h_load is computed only for those cfs_rq's, which really need it, in particular all idle task groups are skipped. Although this, in fact, moves h_load calculation under rq lock, it should not affect latency much, because the amount of work done under rq lock while trying to pull tasks is limited by sched_nr_migrate. After the patch applied with the setup described above (1000 wakers in the root cgroup and 10000 idle cgroups), I get: Cpu(s): 16.9%us, 24.8%sy, 0.0%ni, 58.4%id, 0.0%wa, 0.0%hi, 0.0%si Events: 242K cycles 7.57% [kernel] [k] __schedule 6.70% [kernel] [k] timerqueue_add 5.93% libc-2.12.so [.] usleep Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1373896159-1278-1-git-send-email-vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 58 ++++++++++++++++++++++++---------------------------- kernel/sched/sched.h | 7 +++---- 2 files changed, 30 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bb456f44b7b1..765d87acdf05 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4171,47 +4171,48 @@ static void update_blocked_averages(int cpu) } /* - * Compute the cpu's hierarchical load factor for each task group. + * Compute the hierarchical load factor for cfs_rq and all its ascendants. * This needs to be done in a top-down fashion because the load of a child * group is a fraction of its parents load. */ -static int tg_load_down(struct task_group *tg, void *data) +static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) { - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->avg.load_avg_contrib; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib, - tg->parent->cfs_rq[cpu]->runnable_load_avg + 1); - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - struct rq *rq = cpu_rq(cpu); + struct rq *rq = rq_of(cfs_rq); + struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)]; unsigned long now = jiffies; + unsigned long load; - if (rq->h_load_throttle == now) + if (cfs_rq->last_h_load_update == now) return; - rq->h_load_throttle = now; + cfs_rq->h_load_next = NULL; + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); + cfs_rq->h_load_next = se; + if (cfs_rq->last_h_load_update == now) + break; + } - rcu_read_lock(); - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); - rcu_read_unlock(); + if (!se) { + cfs_rq->h_load = rq->avg.load_avg_contrib; + cfs_rq->last_h_load_update = now; + } + + while ((se = cfs_rq->h_load_next) != NULL) { + load = cfs_rq->h_load; + load = div64_ul(load * se->avg.load_avg_contrib, + cfs_rq->runnable_load_avg + 1); + cfs_rq = group_cfs_rq(se); + cfs_rq->h_load = load; + cfs_rq->last_h_load_update = now; + } } static unsigned long task_h_load(struct task_struct *p) { struct cfs_rq *cfs_rq = task_cfs_rq(p); + update_cfs_rq_h_load(cfs_rq); return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load, cfs_rq->runnable_load_avg + 1); } @@ -4220,10 +4221,6 @@ static inline void update_blocked_averages(int cpu) { } -static inline void update_h_load(long cpu) -{ -} - static unsigned long task_h_load(struct task_struct *p) { return p->se.avg.load_avg_contrib; @@ -5108,7 +5105,6 @@ redo: env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); - update_h_load(env.src_cpu); more_balance: local_irq_save(flags); double_rq_lock(env.dst_rq, busiest); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..5e129efb84ce 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -285,7 +285,6 @@ struct cfs_rq { /* Required to track per-cpu representation of a task_group */ u32 tg_runnable_contrib; unsigned long tg_load_contrib; -#endif /* CONFIG_FAIR_GROUP_SCHED */ /* * h_load = weight * f(tg) @@ -294,6 +293,9 @@ struct cfs_rq { * this group. */ unsigned long h_load; + u64 last_h_load_update; + struct sched_entity *h_load_next; +#endif /* CONFIG_FAIR_GROUP_SCHED */ #endif /* CONFIG_SMP */ #ifdef CONFIG_FAIR_GROUP_SCHED @@ -429,9 +431,6 @@ struct rq { #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; -#ifdef CONFIG_SMP - unsigned long h_load_throttle; -#endif /* CONFIG_SMP */ #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED -- cgit v1.3-8-gc7d7 From 62470419e993f8d9d93db0effd3af4296ecb79a5 Mon Sep 17 00:00:00 2001 From: Michael Wang Date: Thu, 4 Jul 2013 12:55:51 +0800 Subject: sched: Implement smarter wake-affine logic The wake-affine scheduler feature is currently always trying to pull the wakee close to the waker. In theory this should be beneficial if the waker's CPU caches hot data for the wakee, and it's also beneficial in the extreme ping-pong high context switch rate case. Testing shows it can benefit hackbench up to 15%. However, the feature is somewhat blind, from which some workloads such as pgbench suffer. It's also time-consuming algorithmically. Testing shows it can damage pgbench up to 50% - far more than the benefit it brings in the best case. So wake-affine should be smarter and it should realize when to stop its thankless effort at trying to find a suitable CPU to wake on. This patch introduces 'wakee_flips', which will be increased each time the task flips (switches) its wakee target. So a high 'wakee_flips' value means the task has more than one wakee, and the bigger the number, the higher the wakeup frequency. Now when making the decision on whether to pull or not, pay attention to the wakee with a high 'wakee_flips', pulling such a task may benefit the wakee. Also imply that the waker will face cruel competition later, it could be very cruel or very fast depends on the story behind 'wakee_flips', waker therefore suffers. Furthermore, if waker also has a high 'wakee_flips', that implies that multiple tasks rely on it, then waker's higher latency will damage all of them, so pulling wakee seems to be a bad deal. Thus, when 'waker->wakee_flips / wakee->wakee_flips' becomes higher and higher, the cost of pulling seems to be worse and worse. The patch therefore helps the wake-affine feature to stop its pulling work when: wakee->wakee_flips > factor && waker->wakee_flips > (factor * wakee->wakee_flips) The 'factor' here is the number of CPUs in the current CPU's NUMA node, so a bigger node will lead to more pulling since the trial becomes more severe. After applying the patch, pgbench shows up to 40% improvements and no regressions. Tested with 12 cpu x86 server and tip 3.10.0-rc7. The percentages in the final column highlight the areas with the biggest wins, all other areas improved as well: pgbench base smart | db_size | clients | tps | | tps | +---------+---------+-------+ +-------+ | 22 MB | 1 | 10598 | | 10796 | | 22 MB | 2 | 21257 | | 21336 | | 22 MB | 4 | 41386 | | 41622 | | 22 MB | 8 | 51253 | | 57932 | | 22 MB | 12 | 48570 | | 54000 | | 22 MB | 16 | 46748 | | 55982 | +19.75% | 22 MB | 24 | 44346 | | 55847 | +25.93% | 22 MB | 32 | 43460 | | 54614 | +25.66% | 7484 MB | 1 | 8951 | | 9193 | | 7484 MB | 2 | 19233 | | 19240 | | 7484 MB | 4 | 37239 | | 37302 | | 7484 MB | 8 | 46087 | | 50018 | | 7484 MB | 12 | 42054 | | 48763 | | 7484 MB | 16 | 40765 | | 51633 | +26.66% | 7484 MB | 24 | 37651 | | 52377 | +39.11% | 7484 MB | 32 | 37056 | | 51108 | +37.92% | 15 GB | 1 | 8845 | | 9104 | | 15 GB | 2 | 19094 | | 19162 | | 15 GB | 4 | 36979 | | 36983 | | 15 GB | 8 | 46087 | | 49977 | | 15 GB | 12 | 41901 | | 48591 | | 15 GB | 16 | 40147 | | 50651 | +26.16% | 15 GB | 24 | 37250 | | 52365 | +40.58% | 15 GB | 32 | 36470 | | 50015 | +37.14% Signed-off-by: Michael Wang Cc: Mike Galbraith Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51D50057.9000809@linux.vnet.ibm.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 +++ kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 50d04b92ceda..4f163a8ffabf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1034,6 +1034,9 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; + struct task_struct *last_wakee; + unsigned long wakee_flips; + unsigned long wakee_flip_decay_ts; #endif int on_rq; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 765d87acdf05..860063a8c849 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3017,6 +3017,23 @@ static unsigned long cpu_avg_load_per_task(int cpu) return 0; } +static void record_wakee(struct task_struct *p) +{ + /* + * Rough decay (wiping) for cost saving, don't worry + * about the boundary, really active task won't care + * about the loss. + */ + if (jiffies > current->wakee_flip_decay_ts + HZ) { + current->wakee_flips = 0; + current->wakee_flip_decay_ts = jiffies; + } + + if (current->last_wakee != p) { + current->last_wakee = p; + current->wakee_flips++; + } +} static void task_waking_fair(struct task_struct *p) { @@ -3037,6 +3054,7 @@ static void task_waking_fair(struct task_struct *p) #endif se->vruntime -= min_vruntime; + record_wakee(p); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -3155,6 +3173,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, #endif +static int wake_wide(struct task_struct *p) +{ + int factor = nr_cpus_node(cpu_to_node(smp_processor_id())); + + /* + * Yeah, it's the switching-frequency, could means many wakee or + * rapidly switch, use factor here will just help to automatically + * adjust the loose-degree, so bigger node will lead to more pull. + */ + if (p->wakee_flips > factor) { + /* + * wakee is somewhat hot, it needs certain amount of cpu + * resource, so if waker is far more hot, prefer to leave + * it alone. + */ + if (current->wakee_flips > (factor * p->wakee_flips)) + return 1; + } + + return 0; +} + static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) { s64 this_load, load; @@ -3164,6 +3204,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long weight; int balanced; + /* + * If we wake multiple tasks be careful to not bounce + * ourselves around too much. + */ + if (wake_wide(p)) + return 0; + idx = sd->wake_idx; this_cpu = smp_processor_id(); prev_cpu = task_cpu(p); -- cgit v1.3-8-gc7d7 From 7d9ffa8961482232d964173cccba6e14d2d543b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Jul 2013 12:56:46 +0800 Subject: sched: Micro-optimize the smart wake-affine logic Smart wake-affine is using node-size as the factor currently, but the overhead of the mask operation is high. Thus, this patch introduce the 'sd_llc_size' percpu variable, which will record the highest cache-share domain size, and make it to be the new factor, in order to reduce the overhead and make it more reasonable. Tested-by: Davidlohr Bueso Tested-by: Michael Wang Signed-off-by: Peter Zijlstra Acked-by: Michael Wang Cc: Mike Galbraith Link: http://lkml.kernel.org/r/51D5008E.6030102@linux.vnet.ibm.com [ Tidied up the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 7 ++++++- kernel/sched/fair.c | 2 +- kernel/sched/sched.h | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7c32cb7bfeb..6df0fbe53767 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5083,18 +5083,23 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) * two cpus are in the same cache domain, see cpus_share_cache(). */ DEFINE_PER_CPU(struct sched_domain *, sd_llc); +DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); static void update_top_cache_domain(int cpu) { struct sched_domain *sd; int id = cpu; + int size = 1; sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES); - if (sd) + if (sd) { id = cpumask_first(sched_domain_span(sd)); + size = cpumask_weight(sched_domain_span(sd)); + } rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 860063a8c849..f237437446e5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3175,7 +3175,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu, static int wake_wide(struct task_struct *p) { - int factor = nr_cpus_node(cpu_to_node(smp_processor_id())); + int factor = this_cpu_read(sd_llc_size); /* * Yeah, it's the switching-frequency, could means many wakee or diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 5e129efb84ce..4c1cb8029feb 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -594,6 +594,7 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) } DECLARE_PER_CPU(struct sched_domain *, sd_llc); +DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); struct sched_group_power { -- cgit v1.3-8-gc7d7 From 2b4883972271f8d61de67aa365ade89dfff69db1 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 24 Jul 2013 11:25:17 -0700 Subject: mutex: Avoid label warning when !CONFIG_MUTEX_SPIN_ON_OWNER Fengguang reported the following warning when optimistic spinning is disabled (ie: make allnoconfig): kernel/mutex.c:599:1: warning: label 'done' defined but not used Remove the 'done' label altogether. Reported-by: Fengguang Wu Signed-off-by: Davidlohr Bueso Signed-off-by: Ingo Molnar --- kernel/mutex.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 386ad5da47a5..98164a55a4dc 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -489,7 +489,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_set_owner(lock); mspin_unlock(MLOCK(lock), &node); - goto done; + preempt_enable(); + return 0; } mspin_unlock(MLOCK(lock), &node); @@ -596,7 +597,6 @@ skip_wait: } spin_unlock_mutex(&lock->wait_lock, flags); -done: preempt_enable(); return 0; -- cgit v1.3-8-gc7d7 From 3831261eb08557a1390b29c1038a0217232d8fdb Mon Sep 17 00:00:00 2001 From: "Brandt, Todd E" Date: Thu, 11 Jul 2013 07:44:35 +0000 Subject: PM / Sleep: increase ftrace coverage in suspend/resume Change where ftrace is disabled and re-enabled during system suspend/resume to allow tracing of device driver pm callbacks. Ftrace will now be turned off when suspend reaches disable_nonboot_cpus() instead of at the very beginning of system suspend. Ftrace was disabled during suspend/resume back in 2008 by Steven Rostedt as he discovered there was a conflict in the enable_nonboot_cpus() call (see commit f42ac38 "ftrace: disable tracing for suspend to ram"). This change preserves his fix by disabling ftrace, but only at the function where it is known to cause problems. The new change allows tracing of the device level code for better debug. [rjw: Changelog] Signed-off-by: Todd Brandt Signed-off-by: Rafael J. Wysocki --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index ece04223bb1e..62ee437b5c7e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -210,6 +210,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) goto Platform_wake; } + ftrace_stop(); error = disable_nonboot_cpus(); if (error || suspend_test(TEST_CPUS)) goto Enable_cpus; @@ -232,6 +233,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) Enable_cpus: enable_nonboot_cpus(); + ftrace_start(); Platform_wake: if (need_suspend_ops(state) && suspend_ops->wake) @@ -265,7 +267,6 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - ftrace_stop(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -285,7 +286,6 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - ftrace_start(); resume_console(); Close: if (need_suspend_ops(state) && suspend_ops->end) -- cgit v1.3-8-gc7d7 From 102c9323c35a83789ad5ebd3c45fa8fb389add88 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Jul 2013 17:07:27 -0400 Subject: tracing: Add __tracepoint_string() to export string pointers There are several tracepoints (mostly in RCU), that reference a string pointer and uses the print format of "%s" to display the string that exists in the kernel, instead of copying the actual string to the ring buffer (saves time and ring buffer space). But this has an issue with userspace tools that read the binary buffers that has the address of the string but has no access to what the string itself is. The end result is just output that looks like: rcu_dyntick: ffffffff818adeaa 1 0 rcu_dyntick: ffffffff818adeb5 0 140000000000000 rcu_dyntick: ffffffff818adeb5 0 140000000000000 rcu_utilization: ffffffff8184333b rcu_utilization: ffffffff8184333b The above is pretty useless when read by the userspace tools. Ideally we would want something that looks like this: rcu_dyntick: Start 1 0 rcu_dyntick: End 0 140000000000000 rcu_dyntick: Start 140000000000000 0 rcu_callback: rcu_preempt rhp=0xffff880037aff710 func=put_cred_rcu 0/4 rcu_callback: rcu_preempt rhp=0xffff880078961980 func=file_free_rcu 0/5 rcu_dyntick: End 0 1 The trace_printk() which also only stores the address of the string format instead of recording the string into the buffer itself, exports the mapping of kernel addresses to format strings via the printk_format file in the debugfs tracing directory. The tracepoint strings can use this same method and output the format to the same file and the userspace tools will be able to decipher the address without any modification. The tracepoint strings need its own section to save the strings because the trace_printk section will cause the trace_printk() buffers to be allocated if anything exists within the section. trace_printk() is only used for debugging and should never exist in the kernel, we can not use the trace_printk sections. Add a new tracepoint_str section that will also be examined by the output of the printk_format file. Cc: Paul E. McKenney Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 7 ++++++- include/linux/ftrace_event.h | 34 ++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 3 +++ kernel/trace/trace_printk.c | 19 +++++++++++++++++++ 4 files changed, 62 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 69732d279e8b..83e2c31e8b00 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -122,8 +122,12 @@ #define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .; \ *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \ VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .; +#define TRACEPOINT_STR() VMLINUX_SYMBOL(__start___tracepoint_str) = .; \ + *(__tracepoint_str) /* Trace_printk fmt' pointer */ \ + VMLINUX_SYMBOL(__stop___tracepoint_str) = .; #else #define TRACE_PRINTKS() +#define TRACEPOINT_STR() #endif #ifdef CONFIG_FTRACE_SYSCALLS @@ -190,7 +194,8 @@ VMLINUX_SYMBOL(__stop___verbose) = .; \ LIKELY_PROFILE() \ BRANCH_PROFILE() \ - TRACE_PRINTKS() + TRACE_PRINTKS() \ + TRACEPOINT_STR() /* * Data section helpers diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 4372658c73ae..81af18a75f4d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -357,6 +357,40 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) +/** + * tracepoint_string - register constant persistent string to trace system + * @str - a constant persistent string that will be referenced in tracepoints + * + * If constant strings are being used in tracepoints, it is faster and + * more efficient to just save the pointer to the string and reference + * that with a printf "%s" instead of saving the string in the ring buffer + * and wasting space and time. + * + * The problem with the above approach is that userspace tools that read + * the binary output of the trace buffers do not have access to the string. + * Instead they just show the address of the string which is not very + * useful to users. + * + * With tracepoint_string(), the string will be registered to the tracing + * system and exported to userspace via the debugfs/tracing/printk_formats + * file that maps the string address to the string text. This way userspace + * tools that read the binary buffers have a way to map the pointers to + * the ASCII strings they represent. + * + * The @str used must be a constant string and persistent as it would not + * make sense to show a string that no longer exists. But it is still fine + * to be used with modules, because when modules are unloaded, if they + * had tracepoints, the ring buffers are cleared too. As long as the string + * does not change during the life of the module, it is fine to use + * tracepoint_string() within a module. + */ +#define tracepoint_string(str) \ + ({ \ + static const char *___tp_str __tracepoint_string = str; \ + ___tp_str; \ + }) +#define __tracepoint_string __attribute__((section("__tracepoint_str"))) + #ifdef CONFIG_PERF_EVENTS struct perf_event; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4a4f6e1828b6..ba321f12df8c 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -1022,6 +1022,9 @@ extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; +extern const char *__start___tracepoint_str[]; +extern const char *__stop___tracepoint_str[]; + void trace_printk_init_buffers(void); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index a9077c1b4ad3..2900817ba65c 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -244,12 +244,31 @@ static const char **find_next(void *v, loff_t *pos) { const char **fmt = v; int start_index; + int last_index; start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt; if (*pos < start_index) return __start___trace_bprintk_fmt + *pos; + /* + * The __tracepoint_str section is treated the same as the + * __trace_printk_fmt section. The difference is that the + * __trace_printk_fmt section should only be used by trace_printk() + * in a debugging environment, as if anything exists in that section + * the trace_prink() helper buffers are allocated, which would just + * waste space in a production environment. + * + * The __tracepoint_str sections on the other hand are used by + * tracepoints which need to map pointers to their strings to + * the ASCII text for userspace. + */ + last_index = start_index; + start_index = __stop___tracepoint_str - __start___tracepoint_str; + + if (*pos < last_index + start_index) + return __start___tracepoint_str + (*pos - last_index); + return find_next_mod_format(start_index, v, fmt, pos); } -- cgit v1.3-8-gc7d7 From 9ad9d25a1ec82d6e52d687348e8cdd4942e7d393 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Sat, 27 Jul 2013 11:56:49 +0800 Subject: cpuset: get rid of the useless forward declaration of cpuset get rid of the useless forward declaration of the struct cpuset cause the below define it. Signed-off-by: Zhao Hongjiang Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index e5657788fedd..2ddd9b93feaa 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -70,7 +70,6 @@ int number_of_cpusets __read_mostly; /* Forward declare cgroup structures */ struct cgroup_subsys cpuset_subsys; -struct cpuset; /* See "Frequency meter" comments, below. */ -- cgit v1.3-8-gc7d7 From 0b9e6965add0701e5cbf56d5bab6d9181e948359 Mon Sep 17 00:00:00 2001 From: Zhao Hongjiang Date: Sat, 27 Jul 2013 11:56:53 +0800 Subject: cpuset: relocate a misplaced comment Comment for cpuset_css_offline() was on top of cpuset_css_free(). Move it. Signed-off-by: Zhao Hongjiang Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2ddd9b93feaa..703bfd5a32a9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2020,6 +2020,12 @@ out_unlock: return 0; } +/* + * If the cpuset being removed has its flag 'sched_load_balance' + * enabled, then simulate turning sched_load_balance off, which + * will call rebuild_sched_domains_locked(). + */ + static void cpuset_css_offline(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); @@ -2035,12 +2041,6 @@ static void cpuset_css_offline(struct cgroup *cgrp) mutex_unlock(&cpuset_mutex); } -/* - * If the cpuset being removed has its flag 'sched_load_balance' - * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains_locked(). - */ - static void cpuset_css_free(struct cgroup *cgrp) { struct cpuset *cs = cgroup_cs(cgrp); -- cgit v1.3-8-gc7d7 From e66c33d579ea566d10e8c8695a7168aae3e02992 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Jul 2013 16:50:28 -0400 Subject: rcu: Add const annotation to char * for RCU tracepoints and functions All the RCU tracepoints and functions that reference char pointers do so with just 'char *' even though they do not modify the contents of the string itself. This will cause warnings if a const char * is used in one of these functions. The RCU tracepoints store the pointer to the string to refer back to them when the trace output is displayed. As this can be minutes, hours or even days later, those strings had better be constant. This change also opens the door to allow the RCU tracepoint strings and their addresses to be exported so that userspace tracing tools can translate the contents of the pointers of the RCU tracepoints. Signed-off-by: Steven Rostedt --- include/linux/rcupdate.h | 4 +-- include/trace/events/rcu.h | 82 +++++++++++++++++++++++----------------------- kernel/rcu.h | 2 +- kernel/rcupdate.c | 2 +- kernel/rcutiny.c | 2 +- kernel/rcutiny_plugin.h | 2 +- kernel/rcutorture.c | 8 ++--- kernel/rcutree.c | 4 +-- kernel/rcutree.h | 2 +- 9 files changed, 54 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 4b14bdc911d7..0c38abbe6e35 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,7 +52,7 @@ extern int rcutorture_runnable; /* for sysctl */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) extern void rcutorture_record_test_transition(void); extern void rcutorture_record_progress(unsigned long vernum); -extern void do_trace_rcu_torture_read(char *rcutorturename, +extern void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, @@ -65,7 +65,7 @@ static inline void rcutorture_record_progress(unsigned long vernum) { } #ifdef CONFIG_RCU_TRACE -extern void do_trace_rcu_torture_read(char *rcutorturename, +extern void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 59ebcc89f148..ee2376cfaab3 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -19,12 +19,12 @@ */ TRACE_EVENT(rcu_utilization, - TP_PROTO(char *s), + TP_PROTO(const char *s), TP_ARGS(s), TP_STRUCT__entry( - __field(char *, s) + __field(const char *, s) ), TP_fast_assign( @@ -51,14 +51,14 @@ TRACE_EVENT(rcu_utilization, */ TRACE_EVENT(rcu_grace_period, - TP_PROTO(char *rcuname, unsigned long gpnum, char *gpevent), + TP_PROTO(const char *rcuname, unsigned long gpnum, const char *gpevent), TP_ARGS(rcuname, gpnum, gpevent), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) - __field(char *, gpevent) + __field(const char *, gpevent) ), TP_fast_assign( @@ -89,21 +89,21 @@ TRACE_EVENT(rcu_grace_period, */ TRACE_EVENT(rcu_future_grace_period, - TP_PROTO(char *rcuname, unsigned long gpnum, unsigned long completed, + TP_PROTO(const char *rcuname, unsigned long gpnum, unsigned long completed, unsigned long c, u8 level, int grplo, int grphi, - char *gpevent), + const char *gpevent), TP_ARGS(rcuname, gpnum, completed, c, level, grplo, grphi, gpevent), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(unsigned long, completed) __field(unsigned long, c) __field(u8, level) __field(int, grplo) __field(int, grphi) - __field(char *, gpevent) + __field(const char *, gpevent) ), TP_fast_assign( @@ -132,13 +132,13 @@ TRACE_EVENT(rcu_future_grace_period, */ TRACE_EVENT(rcu_grace_period_init, - TP_PROTO(char *rcuname, unsigned long gpnum, u8 level, + TP_PROTO(const char *rcuname, unsigned long gpnum, u8 level, int grplo, int grphi, unsigned long qsmask), TP_ARGS(rcuname, gpnum, level, grplo, grphi, qsmask), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(u8, level) __field(int, grplo) @@ -168,12 +168,12 @@ TRACE_EVENT(rcu_grace_period_init, */ TRACE_EVENT(rcu_preempt_task, - TP_PROTO(char *rcuname, int pid, unsigned long gpnum), + TP_PROTO(const char *rcuname, int pid, unsigned long gpnum), TP_ARGS(rcuname, pid, gpnum), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(int, pid) ), @@ -195,12 +195,12 @@ TRACE_EVENT(rcu_preempt_task, */ TRACE_EVENT(rcu_unlock_preempted_task, - TP_PROTO(char *rcuname, unsigned long gpnum, int pid), + TP_PROTO(const char *rcuname, unsigned long gpnum, int pid), TP_ARGS(rcuname, gpnum, pid), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(int, pid) ), @@ -224,14 +224,14 @@ TRACE_EVENT(rcu_unlock_preempted_task, */ TRACE_EVENT(rcu_quiescent_state_report, - TP_PROTO(char *rcuname, unsigned long gpnum, + TP_PROTO(const char *rcuname, unsigned long gpnum, unsigned long mask, unsigned long qsmask, u8 level, int grplo, int grphi, int gp_tasks), TP_ARGS(rcuname, gpnum, mask, qsmask, level, grplo, grphi, gp_tasks), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(unsigned long, mask) __field(unsigned long, qsmask) @@ -268,15 +268,15 @@ TRACE_EVENT(rcu_quiescent_state_report, */ TRACE_EVENT(rcu_fqs, - TP_PROTO(char *rcuname, unsigned long gpnum, int cpu, char *qsevent), + TP_PROTO(const char *rcuname, unsigned long gpnum, int cpu, const char *qsevent), TP_ARGS(rcuname, gpnum, cpu, qsevent), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(unsigned long, gpnum) __field(int, cpu) - __field(char *, qsevent) + __field(const char *, qsevent) ), TP_fast_assign( @@ -308,12 +308,12 @@ TRACE_EVENT(rcu_fqs, */ TRACE_EVENT(rcu_dyntick, - TP_PROTO(char *polarity, long long oldnesting, long long newnesting), + TP_PROTO(const char *polarity, long long oldnesting, long long newnesting), TP_ARGS(polarity, oldnesting, newnesting), TP_STRUCT__entry( - __field(char *, polarity) + __field(const char *, polarity) __field(long long, oldnesting) __field(long long, newnesting) ), @@ -352,12 +352,12 @@ TRACE_EVENT(rcu_dyntick, */ TRACE_EVENT(rcu_prep_idle, - TP_PROTO(char *reason), + TP_PROTO(const char *reason), TP_ARGS(reason), TP_STRUCT__entry( - __field(char *, reason) + __field(const char *, reason) ), TP_fast_assign( @@ -376,13 +376,13 @@ TRACE_EVENT(rcu_prep_idle, */ TRACE_EVENT(rcu_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp, long qlen_lazy, + TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen_lazy, long qlen), TP_ARGS(rcuname, rhp, qlen_lazy, qlen), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(void *, rhp) __field(void *, func) __field(long, qlen_lazy) @@ -412,13 +412,13 @@ TRACE_EVENT(rcu_callback, */ TRACE_EVENT(rcu_kfree_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset, + TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, long qlen_lazy, long qlen), TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) __field(long, qlen_lazy) @@ -447,12 +447,12 @@ TRACE_EVENT(rcu_kfree_callback, */ TRACE_EVENT(rcu_batch_start, - TP_PROTO(char *rcuname, long qlen_lazy, long qlen, long blimit), + TP_PROTO(const char *rcuname, long qlen_lazy, long qlen, long blimit), TP_ARGS(rcuname, qlen_lazy, qlen, blimit), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(long, qlen_lazy) __field(long, qlen) __field(long, blimit) @@ -477,12 +477,12 @@ TRACE_EVENT(rcu_batch_start, */ TRACE_EVENT(rcu_invoke_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp), + TP_PROTO(const char *rcuname, struct rcu_head *rhp), TP_ARGS(rcuname, rhp), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(void *, rhp) __field(void *, func) ), @@ -506,12 +506,12 @@ TRACE_EVENT(rcu_invoke_callback, */ TRACE_EVENT(rcu_invoke_kfree_callback, - TP_PROTO(char *rcuname, struct rcu_head *rhp, unsigned long offset), + TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset), TP_ARGS(rcuname, rhp, offset), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(void *, rhp) __field(unsigned long, offset) ), @@ -539,13 +539,13 @@ TRACE_EVENT(rcu_invoke_kfree_callback, */ TRACE_EVENT(rcu_batch_end, - TP_PROTO(char *rcuname, int callbacks_invoked, + TP_PROTO(const char *rcuname, int callbacks_invoked, bool cb, bool nr, bool iit, bool risk), TP_ARGS(rcuname, callbacks_invoked, cb, nr, iit, risk), TP_STRUCT__entry( - __field(char *, rcuname) + __field(const char *, rcuname) __field(int, callbacks_invoked) __field(bool, cb) __field(bool, nr) @@ -577,13 +577,13 @@ TRACE_EVENT(rcu_batch_end, */ TRACE_EVENT(rcu_torture_read, - TP_PROTO(char *rcutorturename, struct rcu_head *rhp, + TP_PROTO(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c), TP_ARGS(rcutorturename, rhp, secs, c_old, c), TP_STRUCT__entry( - __field(char *, rcutorturename) + __field(const char *, rcutorturename) __field(struct rcu_head *, rhp) __field(unsigned long, secs) __field(unsigned long, c_old) @@ -623,13 +623,13 @@ TRACE_EVENT(rcu_torture_read, */ TRACE_EVENT(rcu_barrier, - TP_PROTO(char *rcuname, char *s, int cpu, int cnt, unsigned long done), + TP_PROTO(const char *rcuname, const char *s, int cpu, int cnt, unsigned long done), TP_ARGS(rcuname, s, cpu, cnt, done), TP_STRUCT__entry( - __field(char *, rcuname) - __field(char *, s) + __field(const char *, rcuname) + __field(const char *, s) __field(int, cpu) __field(int, cnt) __field(unsigned long, done) diff --git a/kernel/rcu.h b/kernel/rcu.h index 7f8e7590e3e5..0a90ccc65bfb 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -94,7 +94,7 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) extern void kfree(const void *); -static inline bool __rcu_reclaim(char *rn, struct rcu_head *head) +static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) { unsigned long offset = (unsigned long)head->func; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cce6ba8bbace..14994d4e1a54 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -377,7 +377,7 @@ EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) -void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp, +void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, unsigned long c_old, unsigned long c) { diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c index aa344111de3e..9ed6075dc562 100644 --- a/kernel/rcutiny.c +++ b/kernel/rcutiny.c @@ -264,7 +264,7 @@ void rcu_check_callbacks(int cpu, int user) */ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) { - char *rn = NULL; + const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; RCU_TRACE(int cb_count = 0); diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h index 0cd385acccfa..280d06cae352 100644 --- a/kernel/rcutiny_plugin.h +++ b/kernel/rcutiny_plugin.h @@ -36,7 +36,7 @@ struct rcu_ctrlblk { RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ - RCU_TRACE(char *name); /* Name of RCU type. */ + RCU_TRACE(const char *name); /* Name of RCU type. */ }; /* Definition for rcupdate control block. */ diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index f4871e52c546..3d936f0fbcd8 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -267,7 +267,7 @@ rcutorture_shutdown_notify(struct notifier_block *unused1, * Absorb kthreads into a kernel function that won't return, so that * they won't ever access module text or data again. */ -static void rcutorture_shutdown_absorb(char *title) +static void rcutorture_shutdown_absorb(const char *title) { if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { pr_notice( @@ -337,7 +337,7 @@ rcu_random(struct rcu_random_state *rrsp) } static void -rcu_stutter_wait(char *title) +rcu_stutter_wait(const char *title) { while (stutter_pause_test || !rcutorture_runnable) { if (rcutorture_runnable) @@ -366,7 +366,7 @@ struct rcu_torture_ops { int (*stats)(char *page); int irq_capable; int can_boost; - char *name; + const char *name; }; static struct rcu_torture_ops *cur_ops; @@ -1364,7 +1364,7 @@ rcu_torture_stutter(void *arg) } static inline void -rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag) +rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) { pr_alert("%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 068de3a93606..30201494560b 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1032,7 +1032,7 @@ static unsigned long rcu_cbs_completed(struct rcu_state *rsp, * rcu_nocb_wait_gp(). */ static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long c, char *s) + unsigned long c, const char *s) { trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, rnp->completed, c, rnp->level, @@ -2720,7 +2720,7 @@ static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) * Helper function for _rcu_barrier() tracing. If tracing is disabled, * the compiler is expected to optimize this away. */ -static void _rcu_barrier_trace(struct rcu_state *rsp, char *s, +static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, int cpu, unsigned long done) { trace_rcu_barrier(rsp->name, s, cpu, diff --git a/kernel/rcutree.h b/kernel/rcutree.h index b3832581043c..cbdeac6cea9e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -445,7 +445,7 @@ struct rcu_state { /* for CPU stalls. */ unsigned long gp_max; /* Maximum GP duration in */ /* jiffies. */ - char *name; /* Name of structure. */ + const char *name; /* Name of structure. */ char abbr; /* Abbreviated name. */ struct list_head flavors; /* List of RCU flavors. */ struct irq_work wakeup_work; /* Postponed wakeups */ -- cgit v1.3-8-gc7d7 From a41bfeb2f8ed59410be7ca0f8fbc6138a758b746 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Jul 2013 17:00:28 -0400 Subject: rcu: Simplify RCU_STATE_INITIALIZER() macro The RCU_STATE_INITIALIZER() macro is used only in the rcutree.c file as well as the rcutree_plugin.h file. It is passed as a rvalue to a variable of a similar name. A per_cpu variable is also created with a similar name as well. The uses of RCU_STATE_INITIALIZER() can be simplified to remove some of the duplicate code that is done. Currently the three users of this macro has this format: struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); Notice that "rcu_sched" is called three times. This is the same with the other two users. This can be condensed to just: RCU_STATE_INITIALIZER(rcu_sched, call_rcu_sched); by moving the rest into the macro itself. This also opens the door to allow the RCU tracepoint strings and their addresses to be exported so that userspace tracing tools can translate the contents of the pointers of the RCU tracepoints. The change will allow for helper code to be placed in the RCU_STATE_INITIALIZER() macro to export the name that is used. Signed-off-by: Steven Rostedt --- kernel/rcutree.c | 14 ++++++-------- kernel/rcutree_plugin.h | 4 +--- 2 files changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 30201494560b..97994a329d80 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -64,7 +64,8 @@ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -#define RCU_STATE_INITIALIZER(sname, sabbr, cr) { \ +#define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ @@ -77,14 +78,11 @@ static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = #sname, \ .abbr = sabbr, \ -} - -struct rcu_state rcu_sched_state = - RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); -DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); +}; \ +DEFINE_PER_CPU(struct rcu_data, sname##_data) -struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); -DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); +RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); +RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *rcu_state; LIST_HEAD(rcu_struct_flavors); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 769e12e3151b..6976a7dde874 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -110,9 +110,7 @@ static void __init rcu_bootup_announce_oddness(void) #ifdef CONFIG_TREE_PREEMPT_RCU -struct rcu_state rcu_preempt_state = - RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); -DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); +RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); static struct rcu_state *rcu_state = &rcu_preempt_state; static int rcu_preempted_readers_exp(struct rcu_node *rnp); -- cgit v1.3-8-gc7d7 From f7f7bac9cb1c50783f15937a11743655a5756a36 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 12 Jul 2013 17:18:47 -0400 Subject: rcu: Have the RCU tracepoints use the tracepoint_string infrastructure Currently, RCU tracepoints save only a pointer to strings in the ring buffer. When displayed via the /sys/kernel/debug/tracing/trace file they are referenced like the printf "%s" that looks at the address in the ring buffer and prints out the string it points too. This requires that the strings are constant and persistent in the kernel. The problem with this is for tools like trace-cmd and perf that read the binary data from the buffers but have no access to the kernel memory to find out what string is represented by the address in the buffer. By using the tracepoint_string infrastructure, the RCU tracepoint strings can be exported such that userspace tools can map the addresses to the strings. # cat /sys/kernel/debug/tracing/printk_formats 0xffffffff81a4a0e8 : "rcu_preempt" 0xffffffff81a4a0f4 : "rcu_bh" 0xffffffff81a4a100 : "rcu_sched" 0xffffffff818437a0 : "cpuqs" 0xffffffff818437a6 : "rcu_sched" 0xffffffff818437a0 : "cpuqs" 0xffffffff818437b0 : "rcu_bh" 0xffffffff818437b7 : "Start context switch" 0xffffffff818437cc : "End context switch" 0xffffffff818437a0 : "cpuqs" [...] Now userspaces tools can display: rcu_utilization: Start context switch rcu_dyntick: Start 1 0 rcu_utilization: End context switch rcu_batch_start: rcu_preempt CBs=0/5 bl=10 rcu_dyntick: End 0 140000000000000 rcu_invoke_callback: rcu_preempt rhp=0xffff880071c0d600 func=proc_i_callback rcu_invoke_callback: rcu_preempt rhp=0xffff880077b5b230 func=__d_free rcu_dyntick: Start 140000000000000 0 rcu_invoke_callback: rcu_preempt rhp=0xffff880077563980 func=file_free_rcu rcu_batch_end: rcu_preempt CBs-invoked=3 idle=>c<>c<>c<>c< rcu_utilization: End RCU core rcu_grace_period: rcu_preempt 9741 start rcu_dyntick: Start 1 0 rcu_dyntick: End 0 140000000000000 rcu_dyntick: Start 140000000000000 0 Instead of: rcu_utilization: ffffffff81843110 rcu_future_grace_period: ffffffff81842f1d 9939 9939 9940 0 0 3 ffffffff81842f32 rcu_batch_start: ffffffff81842f1d CBs=0/4 bl=10 rcu_future_grace_period: ffffffff81842f1d 9939 9939 9940 0 0 3 ffffffff81842f3c rcu_grace_period: ffffffff81842f1d 9939 ffffffff81842f80 rcu_invoke_callback: ffffffff81842f1d rhp=0xffff88007888aac0 func=file_free_rcu rcu_grace_period: ffffffff81842f1d 9939 ffffffff81842f95 rcu_invoke_callback: ffffffff81842f1d rhp=0xffff88006aeb4600 func=proc_i_callback rcu_future_grace_period: ffffffff81842f1d 9939 9939 9940 0 0 3 ffffffff81842f32 rcu_future_grace_period: ffffffff81842f1d 9939 9939 9940 0 0 3 ffffffff81842f3c rcu_invoke_callback: ffffffff81842f1d rhp=0xffff880071cb9fc0 func=__d_free rcu_grace_period: ffffffff81842f1d 9939 ffffffff81842f80 rcu_invoke_callback: ffffffff81842f1d rhp=0xffff88007888ae80 func=file_free_rcu rcu_batch_end: ffffffff81842f1d CBs-invoked=4 idle=>c<>c<>c<>c< rcu_utilization: ffffffff8184311f Signed-off-by: Steven Rostedt --- kernel/rcutree.c | 87 ++++++++++++++++++++++++++++++------------------- kernel/rcutree_plugin.h | 32 +++++++++--------- 2 files changed, 69 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 97994a329d80..338f1d1c1c66 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -53,18 +53,36 @@ #include #include #include +#include #include "rcutree.h" #include #include "rcu.h" +/* + * Strings used in tracepoints need to be exported via the + * tracing system such that tools like perf and trace-cmd can + * translate the string address pointers to actual text. + */ +#define TPS(x) tracepoint_string(x) + /* Data structures. */ static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; +/* + * In order to export the rcu_state name to the tracing tools, it + * needs to be added in the __tracepoint_string section. + * This requires defining a separate variable tp__varname + * that points to the string being used, and this will allow + * the tracing userspace tools to be able to decipher the string + * address to the matching string. + */ #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ +static char sname##_varname[] = #sname; \ +static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ .call = cr, \ @@ -76,7 +94,7 @@ struct rcu_state sname##_state = { \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ - .name = #sname, \ + .name = sname##_varname, \ .abbr = sabbr, \ }; \ DEFINE_PER_CPU(struct rcu_data, sname##_data) @@ -176,7 +194,7 @@ void rcu_sched_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; } @@ -185,7 +203,7 @@ void rcu_bh_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; } @@ -196,10 +214,10 @@ void rcu_bh_qs(int cpu) */ void rcu_note_context_switch(int cpu) { - trace_rcu_utilization("Start context switch"); + trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(cpu); rcu_preempt_note_context_switch(cpu); - trace_rcu_utilization("End context switch"); + trace_rcu_utilization(TPS("End context switch")); } EXPORT_SYMBOL_GPL(rcu_note_context_switch); @@ -343,11 +361,11 @@ static struct rcu_node *rcu_get_root(struct rcu_state *rsp) static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, bool user) { - trace_rcu_dyntick("Start", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - trace_rcu_dyntick("Error on entry: not idle task", oldval, 0); + trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -477,7 +495,7 @@ void rcu_irq_exit(void) rdtp->dynticks_nesting--; WARN_ON_ONCE(rdtp->dynticks_nesting < 0); if (rdtp->dynticks_nesting) - trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else rcu_eqs_enter_common(rdtp, oldval, true); local_irq_restore(flags); @@ -499,11 +517,11 @@ static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, smp_mb__after_atomic_inc(); /* See above. */ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); rcu_cleanup_after_idle(smp_processor_id()); - trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); if (!user && !is_idle_task(current)) { struct task_struct *idle = idle_task(smp_processor_id()); - trace_rcu_dyntick("Error on exit: not idle task", + trace_rcu_dyntick(TPS("Error on exit: not idle task"), oldval, rdtp->dynticks_nesting); ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", @@ -618,7 +636,7 @@ void rcu_irq_enter(void) rdtp->dynticks_nesting++; WARN_ON_ONCE(rdtp->dynticks_nesting == 0); if (oldval) - trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(rdtp, oldval, true); local_irq_restore(flags); @@ -773,7 +791,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) * of the current RCU grace period. */ if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti"); + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; return 1; } @@ -793,7 +811,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 0; /* Grace period is not old enough. */ barrier(); if (cpu_is_offline(rdp->cpu)) { - trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl"); + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); rdp->offline_fqs++; return 1; } @@ -1056,9 +1074,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * grace period is already marked as needed, return to the caller. */ c = rcu_cbs_completed(rdp->rsp, rnp); - trace_rcu_future_gp(rnp, rdp, c, "Startleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); if (rnp->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, "Prestartleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); return c; } @@ -1072,7 +1090,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) if (rnp->gpnum != rnp->completed || ACCESS_ONCE(rnp->gpnum) != ACCESS_ONCE(rnp->completed)) { rnp->need_future_gp[c & 0x1]++; - trace_rcu_future_gp(rnp, rdp, c, "Startedleaf"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); return c; } @@ -1100,7 +1118,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) * recorded, trace and leave. */ if (rnp_root->need_future_gp[c & 0x1]) { - trace_rcu_future_gp(rnp, rdp, c, "Prestartedroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); goto unlock_out; } @@ -1109,9 +1127,9 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) /* If a grace period is not already in progress, start one. */ if (rnp_root->gpnum != rnp_root->completed) { - trace_rcu_future_gp(rnp, rdp, c, "Startedleafroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); } else { - trace_rcu_future_gp(rnp, rdp, c, "Startedroot"); + trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); } unlock_out: @@ -1135,7 +1153,8 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; - trace_rcu_future_gp(rnp, rdp, c, needmore ? "CleanupMore" : "Cleanup"); + trace_rcu_future_gp(rnp, rdp, c, + needmore ? TPS("CleanupMore") : TPS("Cleanup")); return needmore; } @@ -1203,9 +1222,9 @@ static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, /* Trace depending on how much we were able to accelerate. */ if (!*rdp->nxttail[RCU_WAIT_TAIL]) - trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccWaitCB"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else - trace_rcu_grace_period(rsp->name, rdp->gpnum, "AccReadyCB"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); } /* @@ -1271,7 +1290,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc /* Remember that we saw this grace-period completion. */ rdp->completed = rnp->completed; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); } if (rdp->gpnum != rnp->gpnum) { @@ -1281,7 +1300,7 @@ static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struc * go looking for one. */ rdp->gpnum = rnp->gpnum; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); rdp->passed_quiesce = 0; rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); zero_cpu_stall_ticks(rdp); @@ -1324,7 +1343,7 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Advance to a new grace period and initialize state. */ rsp->gpnum++; - trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); + trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); record_gp_stall_check_time(rsp); raw_spin_unlock_irq(&rnp->lock); @@ -1446,7 +1465,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_nocb_gp_set(rnp, nocb); rsp->completed = rsp->gpnum; /* Declare grace period done. */ - trace_rcu_grace_period(rsp->name, rsp->completed, "end"); + trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); rsp->fqs_state = RCU_GP_IDLE; rdp = this_cpu_ptr(rsp->rda); rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ @@ -1855,7 +1874,7 @@ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) RCU_TRACE(mask = rdp->grpmask); trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), - "cpuofl"); + TPS("cpuofl")); } /* @@ -2042,7 +2061,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) */ void rcu_check_callbacks(int cpu, int user) { - trace_rcu_utilization("Start scheduler-tick"); + trace_rcu_utilization(TPS("Start scheduler-tick")); increment_cpu_stall_ticks(); if (user || rcu_is_cpu_rrupt_from_idle()) { @@ -2075,7 +2094,7 @@ void rcu_check_callbacks(int cpu, int user) rcu_preempt_check_callbacks(cpu); if (rcu_pending(cpu)) invoke_rcu_core(); - trace_rcu_utilization("End scheduler-tick"); + trace_rcu_utilization(TPS("End scheduler-tick")); } /* @@ -2206,10 +2225,10 @@ static void rcu_process_callbacks(struct softirq_action *unused) if (cpu_is_offline(smp_processor_id())) return; - trace_rcu_utilization("Start RCU core"); + trace_rcu_utilization(TPS("Start RCU core")); for_each_rcu_flavor(rsp) __rcu_process_callbacks(rsp); - trace_rcu_utilization("End RCU core"); + trace_rcu_utilization(TPS("End RCU core")); } /* @@ -2950,7 +2969,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->completed = rnp->completed; rdp->passed_quiesce = 0; rdp->qs_pending = 0; - trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl"); + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); } raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; @@ -2980,7 +2999,7 @@ static int rcu_cpu_notify(struct notifier_block *self, struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - trace_rcu_utilization("Start CPU hotplug"); + trace_rcu_utilization(TPS("Start CPU hotplug")); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -3009,7 +3028,7 @@ static int rcu_cpu_notify(struct notifier_block *self, default: break; } - trace_rcu_utilization("End CPU hotplug"); + trace_rcu_utilization(TPS("End CPU hotplug")); return NOTIFY_OK; } diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6976a7dde874..dff86f53ee09 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -167,7 +167,7 @@ static void rcu_preempt_qs(int cpu) struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); if (rdp->passed_quiesce == 0) - trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs"); + trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); rdp->passed_quiesce = 1; current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; } @@ -386,7 +386,7 @@ void rcu_read_unlock_special(struct task_struct *t) np = rcu_next_node_entry(t, rnp); list_del_init(&t->rcu_node_entry); t->rcu_blocked_node = NULL; - trace_rcu_unlock_preempted_task("rcu_preempt", + trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), rnp->gpnum, t->pid); if (&t->rcu_node_entry == rnp->gp_tasks) rnp->gp_tasks = np; @@ -410,7 +410,7 @@ void rcu_read_unlock_special(struct task_struct *t) */ empty_exp_now = !rcu_preempted_readers_exp(rnp); if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { - trace_rcu_quiescent_state_report("preempt_rcu", + trace_rcu_quiescent_state_report(TPS("preempt_rcu"), rnp->gpnum, 0, rnp->qsmask, rnp->level, @@ -1248,12 +1248,12 @@ static int rcu_boost_kthread(void *arg) int spincnt = 0; int more2boost; - trace_rcu_utilization("Start boost kthread@init"); + trace_rcu_utilization(TPS("Start boost kthread@init")); for (;;) { rnp->boost_kthread_status = RCU_KTHREAD_WAITING; - trace_rcu_utilization("End boost kthread@rcu_wait"); + trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); rcu_wait(rnp->boost_tasks || rnp->exp_tasks); - trace_rcu_utilization("Start boost kthread@rcu_wait"); + trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; more2boost = rcu_boost(rnp); if (more2boost) @@ -1262,14 +1262,14 @@ static int rcu_boost_kthread(void *arg) spincnt = 0; if (spincnt > 10) { rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("End boost kthread@rcu_yield"); + trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); schedule_timeout_interruptible(2); - trace_rcu_utilization("Start boost kthread@rcu_yield"); + trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); spincnt = 0; } } /* NOTREACHED */ - trace_rcu_utilization("End boost kthread@notreached"); + trace_rcu_utilization(TPS("End boost kthread@notreached")); return 0; } @@ -1417,7 +1417,7 @@ static void rcu_cpu_kthread(unsigned int cpu) int spincnt; for (spincnt = 0; spincnt < 10; spincnt++) { - trace_rcu_utilization("Start CPU kthread@rcu_wait"); + trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); local_bh_disable(); *statusp = RCU_KTHREAD_RUNNING; this_cpu_inc(rcu_cpu_kthread_loops); @@ -1429,15 +1429,15 @@ static void rcu_cpu_kthread(unsigned int cpu) rcu_kthread_do_work(); local_bh_enable(); if (*workp == 0) { - trace_rcu_utilization("End CPU kthread@rcu_wait"); + trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); *statusp = RCU_KTHREAD_WAITING; return; } } *statusp = RCU_KTHREAD_YIELDING; - trace_rcu_utilization("Start CPU kthread@rcu_yield"); + trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); schedule_timeout_interruptible(2); - trace_rcu_utilization("End CPU kthread@rcu_yield"); + trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); *statusp = RCU_KTHREAD_WAITING; } @@ -2200,7 +2200,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) * Wait for the grace period. Do so interruptibly to avoid messing * up the load average. */ - trace_rcu_future_gp(rnp, rdp, c, "StartWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { wait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], @@ -2208,9 +2208,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) if (likely(d)) break; flush_signals(current); - trace_rcu_future_gp(rnp, rdp, c, "ResumeWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); } - trace_rcu_future_gp(rnp, rdp, c, "EndWait"); + trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); smp_mb(); /* Ensure that CB invocation happens after GP end. */ } -- cgit v1.3-8-gc7d7 From db446a08c23d5475e6b08c87acca79ebb20f283c Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 30 Jul 2013 12:54:40 -0400 Subject: aio: convert the ioctx list to table lookup v3 On Wed, Jun 12, 2013 at 11:14:40AM -0700, Kent Overstreet wrote: > On Mon, Apr 15, 2013 at 02:40:55PM +0300, Octavian Purdila wrote: > > When using a large number of threads performing AIO operations the > > IOCTX list may get a significant number of entries which will cause > > significant overhead. For example, when running this fio script: > > > > rw=randrw; size=256k ;directory=/mnt/fio; ioengine=libaio; iodepth=1 > > blocksize=1024; numjobs=512; thread; loops=100 > > > > on an EXT2 filesystem mounted on top of a ramdisk we can observe up to > > 30% CPU time spent by lookup_ioctx: > > > > 32.51% [guest.kernel] [g] lookup_ioctx > > 9.19% [guest.kernel] [g] __lock_acquire.isra.28 > > 4.40% [guest.kernel] [g] lock_release > > 4.19% [guest.kernel] [g] sched_clock_local > > 3.86% [guest.kernel] [g] local_clock > > 3.68% [guest.kernel] [g] native_sched_clock > > 3.08% [guest.kernel] [g] sched_clock_cpu > > 2.64% [guest.kernel] [g] lock_release_holdtime.part.11 > > 2.60% [guest.kernel] [g] memcpy > > 2.33% [guest.kernel] [g] lock_acquired > > 2.25% [guest.kernel] [g] lock_acquire > > 1.84% [guest.kernel] [g] do_io_submit > > > > This patchs converts the ioctx list to a radix tree. For a performance > > comparison the above FIO script was run on a 2 sockets 8 core > > machine. This are the results (average and %rsd of 10 runs) for the > > original list based implementation and for the radix tree based > > implementation: > > > > cores 1 2 4 8 16 32 > > list 109376 ms 69119 ms 35682 ms 22671 ms 19724 ms 16408 ms > > %rsd 0.69% 1.15% 1.17% 1.21% 1.71% 1.43% > > radix 73651 ms 41748 ms 23028 ms 16766 ms 15232 ms 13787 ms > > %rsd 1.19% 0.98% 0.69% 1.13% 0.72% 0.75% > > % of radix > > relative 66.12% 65.59% 66.63% 72.31% 77.26% 83.66% > > to list > > > > To consider the impact of the patch on the typical case of having > > only one ctx per process the following FIO script was run: > > > > rw=randrw; size=100m ;directory=/mnt/fio; ioengine=libaio; iodepth=1 > > blocksize=1024; numjobs=1; thread; loops=100 > > > > on the same system and the results are the following: > > > > list 58892 ms > > %rsd 0.91% > > radix 59404 ms > > %rsd 0.81% > > % of radix > > relative 100.87% > > to list > > So, I was just doing some benchmarking/profiling to get ready to send > out the aio patches I've got for 3.11 - and it looks like your patch is > causing a ~1.5% throughput regression in my testing :/ ... I've got an alternate approach for fixing this wart in lookup_ioctx()... Instead of using an rbtree, just use the reserved id in the ring buffer header to index an array pointing the ioctx. It's not finished yet, and it needs to be tidied up, but is most of the way there. -ben -- "Thought is the essence of where you are now." -- kmo> And, a rework of Ben's code, but this was entirely his idea kmo> -Kent bcrl> And fix the code to use the right mm_struct in kill_ioctx(), actually free memory. Signed-off-by: Benjamin LaHaise --- fs/aio.c | 136 +++++++++++++++++++++++++++++++++++++++-------- include/linux/mm_types.h | 5 +- kernel/fork.c | 2 +- 3 files changed, 118 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/fs/aio.c b/fs/aio.c index 945dd0d072f3..52f200ebef07 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -66,6 +66,12 @@ struct aio_ring { #define AIO_RING_PAGES 8 +struct kioctx_table { + struct rcu_head rcu; + unsigned nr; + struct kioctx *table[]; +}; + struct kioctx_cpu { unsigned reqs_available; }; @@ -74,9 +80,7 @@ struct kioctx { struct percpu_ref users; atomic_t dead; - /* This needs improving */ unsigned long user_id; - struct hlist_node list; struct __percpu kioctx_cpu *cpu; @@ -135,6 +139,8 @@ struct kioctx { struct page *internal_pages[AIO_RING_PAGES]; struct file *aio_ring_file; + + unsigned id; }; /*------ sysctl variables----*/ @@ -326,7 +332,7 @@ static int aio_setup_ring(struct kioctx *ctx) ring = kmap_atomic(ctx->ring_pages[0]); ring->nr = nr_events; /* user copy */ - ring->id = ctx->user_id; + ring->id = ~0U; ring->head = ring->tail = 0; ring->magic = AIO_RING_MAGIC; ring->compat_features = AIO_RING_COMPAT_FEATURES; @@ -462,6 +468,58 @@ static void free_ioctx_ref(struct percpu_ref *ref) schedule_work(&ctx->free_work); } +static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm) +{ + unsigned i, new_nr; + struct kioctx_table *table, *old; + struct aio_ring *ring; + + spin_lock(&mm->ioctx_lock); + table = rcu_dereference(mm->ioctx_table); + + while (1) { + if (table) + for (i = 0; i < table->nr; i++) + if (!table->table[i]) { + ctx->id = i; + table->table[i] = ctx; + spin_unlock(&mm->ioctx_lock); + + ring = kmap_atomic(ctx->ring_pages[0]); + ring->id = ctx->id; + kunmap_atomic(ring); + return 0; + } + + new_nr = (table ? table->nr : 1) * 4; + + spin_unlock(&mm->ioctx_lock); + + table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) * + new_nr, GFP_KERNEL); + if (!table) + return -ENOMEM; + + table->nr = new_nr; + + spin_lock(&mm->ioctx_lock); + old = rcu_dereference(mm->ioctx_table); + + if (!old) { + rcu_assign_pointer(mm->ioctx_table, table); + } else if (table->nr > old->nr) { + memcpy(table->table, old->table, + old->nr * sizeof(struct kioctx *)); + + rcu_assign_pointer(mm->ioctx_table, table); + kfree_rcu(old, rcu); + } else { + kfree(table); + table = old; + } + } +} + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -520,6 +578,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4); BUG_ON(!ctx->req_batch); + err = ioctx_add_table(ctx, mm); + if (err) + goto out_cleanup_noerr; + /* limit the number of system wide aios */ spin_lock(&aio_nr_lock); if (aio_nr + nr_events > (aio_max_nr * 2UL) || @@ -532,17 +594,13 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */ - /* now link into global list. */ - spin_lock(&mm->ioctx_lock); - hlist_add_head_rcu(&ctx->list, &mm->ioctx_list); - spin_unlock(&mm->ioctx_lock); - pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n", ctx, ctx->user_id, mm, ctx->nr_events); return ctx; out_cleanup: err = -EAGAIN; +out_cleanup_noerr: aio_free_ring(ctx); out_freepcpu: free_percpu(ctx->cpu); @@ -561,10 +619,18 @@ out_freectx: * when the processes owning a context have all exited to encourage * the rapid destruction of the kioctx. */ -static void kill_ioctx(struct kioctx *ctx) +static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx) { if (!atomic_xchg(&ctx->dead, 1)) { - hlist_del_rcu(&ctx->list); + struct kioctx_table *table; + + spin_lock(&mm->ioctx_lock); + table = rcu_dereference(mm->ioctx_table); + + WARN_ON(ctx != table->table[ctx->id]); + table->table[ctx->id] = NULL; + spin_unlock(&mm->ioctx_lock); + /* percpu_ref_kill() will do the necessary call_rcu() */ wake_up_all(&ctx->wait); @@ -613,10 +679,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb); */ void exit_aio(struct mm_struct *mm) { + struct kioctx_table *table; struct kioctx *ctx; - struct hlist_node *n; + unsigned i = 0; + + while (1) { + rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); + + do { + if (!table || i >= table->nr) { + rcu_read_unlock(); + rcu_assign_pointer(mm->ioctx_table, NULL); + if (table) + kfree(table); + return; + } + + ctx = table->table[i++]; + } while (!ctx); + + rcu_read_unlock(); - hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) { /* * We don't need to bother with munmap() here - * exit_mmap(mm) is coming and it'll unmap everything. @@ -627,7 +711,7 @@ void exit_aio(struct mm_struct *mm) */ ctx->mmap_size = 0; - kill_ioctx(ctx); + kill_ioctx(mm, ctx); } } @@ -710,19 +794,27 @@ static void kiocb_free(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { + struct aio_ring __user *ring = (void __user *)ctx_id; struct mm_struct *mm = current->mm; struct kioctx *ctx, *ret = NULL; + struct kioctx_table *table; + unsigned id; + + if (get_user(id, &ring->id)) + return NULL; rcu_read_lock(); + table = rcu_dereference(mm->ioctx_table); - hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) { - if (ctx->user_id == ctx_id) { - percpu_ref_get(&ctx->users); - ret = ctx; - break; - } - } + if (!table || id >= table->nr) + goto out; + ctx = table->table[id]; + if (ctx->user_id == ctx_id) { + percpu_ref_get(&ctx->users); + ret = ctx; + } +out: rcu_read_unlock(); return ret; } @@ -998,7 +1090,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp) if (!IS_ERR(ioctx)) { ret = put_user(ioctx->user_id, ctxp); if (ret) - kill_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); percpu_ref_put(&ioctx->users); } @@ -1016,7 +1108,7 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) { struct kioctx *ioctx = lookup_ioctx(ctx); if (likely(NULL != ioctx)) { - kill_ioctx(ioctx); + kill_ioctx(current->mm, ioctx); percpu_ref_put(&ioctx->users); return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fb425aa16c01..da8cf5cc1aa6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -322,6 +322,7 @@ struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; }; +struct kioctx_table; struct mm_struct { struct vm_area_struct * mmap; /* list of VMAs */ struct rb_root mm_rb; @@ -382,8 +383,8 @@ struct mm_struct { struct core_state *core_state; /* coredumping support */ #ifdef CONFIG_AIO - spinlock_t ioctx_lock; - struct hlist_head ioctx_list; + spinlock_t ioctx_lock; + struct kioctx_table __rcu *ioctx_table; #endif #ifdef CONFIG_MM_OWNER /* diff --git a/kernel/fork.c b/kernel/fork.c index 66635c80a813..db5f541c5488 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -522,7 +522,7 @@ static void mm_init_aio(struct mm_struct *mm) { #ifdef CONFIG_AIO spin_lock_init(&mm->ioctx_lock); - INIT_HLIST_HEAD(&mm->ioctx_list); + mm->ioctx_table = NULL; #endif } -- cgit v1.3-8-gc7d7 From 46591962cb5bfd2bfb0baf42497119c816503598 Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Tue, 30 Jul 2013 11:06:09 +0800 Subject: generic-ipi: Kill unnecessary variable - csd_flags After commit 8969a5ede0f9e17da4b943712429aef2c9bcd82b ("generic-ipi: remove kmalloc()"), wait = 0 can be guaranteed, and all callsites of generic_exec_single() do an unconditional csd_lock() now. So csd_flags is unnecessary now. Remove it. Signed-off-by: Xie XiuQi Signed-off-by: Peter Zijlstra Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Nick Piggin Cc: Jens Axboe Cc: "Paul E. McKenney" Cc: Rusty Russell Link: http://lkml.kernel.org/r/51F72DA1.7010401@huawei.com Signed-off-by: Ingo Molnar --- kernel/smp.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index fe9f773d7114..7332697cd184 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -186,25 +186,13 @@ void generic_smp_call_function_single_interrupt(void) while (!list_empty(&list)) { struct call_single_data *csd; - unsigned int csd_flags; csd = list_entry(list.next, struct call_single_data, list); list_del(&csd->list); - /* - * 'csd' can be invalid after this call if flags == 0 - * (when called through generic_exec_single()), - * so save them away before making the call: - */ - csd_flags = csd->flags; - csd->func(csd->info); - /* - * Unlocked CSDs are valid through generic_exec_single(): - */ - if (csd_flags & CSD_FLAG_LOCK) - csd_unlock(csd); + csd_unlock(csd); } } -- cgit v1.3-8-gc7d7 From 6050cb0b0b366092d1383bc23d7b16cd26db00f0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Jul 2013 02:30:59 +0200 Subject: perf: Fix branch stack refcount leak on callchain init failure On callchain buffers allocation failure, free_event() is called and all the accounting performed in perf_event_alloc() for that event is cancelled. But if the event has branch stack sampling, it is unaccounted as well from the branch stack sampling events refcounts. This is a bug because this accounting is performed after the callchain buffer allocation. As a result, the branch stack sampling events refcount can become negative. To fix this, move the branch stack event accounting before the callchain buffer allocation. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1374539466-4799-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 127411400116..f35aa7e69e2d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6567,6 +6567,12 @@ done: atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) { @@ -6574,12 +6580,6 @@ done: return ERR_PTR(err); } } - if (has_branch_stack(event)) { - static_key_slow_inc(&perf_sched_events.key); - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, - event->cpu)); - } } return event; -- cgit v1.3-8-gc7d7 From 90983b16078ab0fdc58f0dab3e8e3da79c9579a2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Jul 2013 02:31:00 +0200 Subject: perf: Sanitize get_callchain_buffer() In case of allocation failure, get_callchain_buffer() keeps the refcount incremented for the current event. As a result, when get_callchain_buffers() returns an error, we must cleanup what it did by cancelling its last refcount with a call to put_callchain_buffers(). This is a hack in order to be able to call free_event() after that failure. The original purpose of that was to simplify the failure path. But this error handling is actually counter intuitive, ugly and not very easy to follow because one expect to see the resources used to perform a service to be cleaned by the callee if case of failure, not by the caller. So lets clean this up by cancelling the refcount from get_callchain_buffer() in case of failure. And correctly free the event accordingly in perf_event_alloc(). Signed-off-by: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1374539466-4799-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/callchain.c | 2 ++ kernel/events/core.c | 41 +++++++++++++++++++++-------------------- 2 files changed, 23 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c77206184b8b..76a8bc5f6265 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -117,6 +117,8 @@ int get_callchain_buffers(void) err = alloc_callchain_buffers(); exit: mutex_unlock(&callchain_mutex); + if (err) + atomic_dec(&nr_callchain_events); return err; } diff --git a/kernel/events/core.c b/kernel/events/core.c index f35aa7e69e2d..3b998626b7a0 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6457,7 +6457,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; - long err; + long err = -EINVAL; if ((unsigned)cpu >= nr_cpu_ids) { if (!task || cpu != -1) @@ -6540,25 +6540,23 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, * we currently do not support PERF_FORMAT_GROUP on inherited events */ if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; + goto err_ns; pmu = perf_init_event(event); - -done: - err = 0; if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) + goto err_ns; + else if (IS_ERR(pmu)) { err = PTR_ERR(pmu); - - if (err) { - if (event->ns) - put_pid_ns(event->ns); - kfree(event); - return ERR_PTR(err); + goto err_ns; } if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) + goto err_pmu; + } + if (event->attach_state & PERF_ATTACH_TASK) static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) @@ -6573,16 +6571,19 @@ done: atomic_inc(&per_cpu(perf_branch_stack_events, event->cpu)); } - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { - err = get_callchain_buffers(); - if (err) { - free_event(event); - return ERR_PTR(err); - } - } } return event; + +err_pmu: + if (event->destroy) + event->destroy(event); +err_ns: + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + + return ERR_PTR(err); } static int perf_copy_attr(struct perf_event_attr __user *uattr, -- cgit v1.3-8-gc7d7 From 766d6c076928191d75ad5b0d0f58f52b1e7682d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Jul 2013 02:31:01 +0200 Subject: perf: Factor out event accounting code to account_event()/__free_event() Gather all the event accounting code to a single place, once all the prerequisites are completed. This simplifies the refcounting. Original-patch-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1374539466-4799-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 79 +++++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3b998626b7a0..158fd5789e58 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3128,6 +3128,21 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_put(struct ring_buffer *rb); static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void __free_event(struct perf_event *event) +{ + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + call_rcu(&event->rcu_head, free_event_rcu); +} static void free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3141,8 +3156,6 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_comm_events); if (event->attr.task) atomic_dec(&nr_task_events); - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); if (is_cgroup_event(event)) { atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); static_key_slow_dec_deferred(&perf_sched_events); @@ -3180,13 +3193,8 @@ static void free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - call_rcu(&event->rcu_head, free_event_rcu); + __free_event(event); } int perf_event_release_kernel(struct perf_event *event) @@ -6443,6 +6451,29 @@ unlock: return pmu; } +static void account_event(struct perf_event *event) +{ + if (event->attach_state & PERF_ATTACH_TASK) + static_key_slow_inc(&perf_sched_events.key); + if (event->attr.mmap || event->attr.mmap_data) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + if (has_branch_stack(event)) { + static_key_slow_inc(&perf_sched_events.key); + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, + event->cpu)); + } + + if (is_cgroup_event(event)) { + atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + static_key_slow_inc(&perf_sched_events.key); + } +} + /* * Allocate and initialize a event structure */ @@ -6556,21 +6587,6 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (err) goto err_pmu; } - - if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); - if (event->attr.mmap || event->attr.mmap_data) - atomic_inc(&nr_mmap_events); - if (event->attr.comm) - atomic_inc(&nr_comm_events); - if (event->attr.task) - atomic_inc(&nr_task_events); - if (has_branch_stack(event)) { - static_key_slow_inc(&perf_sched_events.key); - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, - event->cpu)); - } } return event; @@ -6865,17 +6881,14 @@ SYSCALL_DEFINE5(perf_event_open, if (flags & PERF_FLAG_PID_CGROUP) { err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) - goto err_alloc; - /* - * one more event: - * - that has cgroup constraint on event->cpu - * - that may need work on context switch - */ - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); - static_key_slow_inc(&perf_sched_events.key); + if (err) { + __free_event(event); + goto err_task; + } } + account_event(event); + /* * Special case software events and allow them to be part of * any hardware group. @@ -7071,6 +7084,8 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err; } + account_event(event); + ctx = find_get_context(event->pmu, task, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); -- cgit v1.3-8-gc7d7 From 4beb31f3657348a8b702dd014d01c520e522012f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Jul 2013 02:31:02 +0200 Subject: perf: Split the per-cpu accounting part of the event accounting code This way we can use the per-cpu handling seperately. This is going to be used by to fix the event migration code accounting. Original-patch-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1374539466-4799-5-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 87 +++++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 158fd5789e58..3a4b73aebc42 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3128,6 +3128,40 @@ static void free_event_rcu(struct rcu_head *head) static void ring_buffer_put(struct ring_buffer *rb); static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); +static void unaccount_event_cpu(struct perf_event *event, int cpu) +{ + if (event->parent) + return; + + if (has_branch_stack(event)) { + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); + } + if (is_cgroup_event(event)) + atomic_dec(&per_cpu(perf_cgroup_events, cpu)); +} + +static void unaccount_event(struct perf_event *event) +{ + if (event->parent) + return; + + if (event->attach_state & PERF_ATTACH_TASK) + static_key_slow_dec_deferred(&perf_sched_events); + if (event->attr.mmap || event->attr.mmap_data) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + if (is_cgroup_event(event)) + static_key_slow_dec_deferred(&perf_sched_events); + if (has_branch_stack(event)) + static_key_slow_dec_deferred(&perf_sched_events); + + unaccount_event_cpu(event, event->cpu); +} + static void __free_event(struct perf_event *event) { if (!event->parent) { @@ -3147,29 +3181,7 @@ static void free_event(struct perf_event *event) { irq_work_sync(&event->pending); - if (!event->parent) { - if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); - if (event->attr.mmap || event->attr.mmap_data) - atomic_dec(&nr_mmap_events); - if (event->attr.comm) - atomic_dec(&nr_comm_events); - if (event->attr.task) - atomic_dec(&nr_task_events); - if (is_cgroup_event(event)) { - atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); - static_key_slow_dec_deferred(&perf_sched_events); - } - - if (has_branch_stack(event)) { - static_key_slow_dec_deferred(&perf_sched_events); - /* is system-wide event */ - if (!(event->attach_state & PERF_ATTACH_TASK)) { - atomic_dec(&per_cpu(perf_branch_stack_events, - event->cpu)); - } - } - } + unaccount_event(event); if (event->rb) { struct ring_buffer *rb; @@ -6451,8 +6463,24 @@ unlock: return pmu; } +static void account_event_cpu(struct perf_event *event, int cpu) +{ + if (event->parent) + return; + + if (has_branch_stack(event)) { + if (!(event->attach_state & PERF_ATTACH_TASK)) + atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); + } + if (is_cgroup_event(event)) + atomic_inc(&per_cpu(perf_cgroup_events, cpu)); +} + static void account_event(struct perf_event *event) { + if (event->parent) + return; + if (event->attach_state & PERF_ATTACH_TASK) static_key_slow_inc(&perf_sched_events.key); if (event->attr.mmap || event->attr.mmap_data) @@ -6461,17 +6489,12 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); - if (has_branch_stack(event)) { + if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, - event->cpu)); - } - - if (is_cgroup_event(event)) { - atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); + if (is_cgroup_event(event)) static_key_slow_inc(&perf_sched_events.key); - } + + account_event_cpu(event, event->cpu); } /* -- cgit v1.3-8-gc7d7 From 9a545de019b536771feefb76f85e5038b65c2190 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Jul 2013 02:31:03 +0200 Subject: perf: Migrate per cpu event accounting When an event is migrated, move the event per-cpu accounting accordingly so that branch stack and cgroup events work correctly on the new CPU. Original-patch-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1374539466-4799-6-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3a4b73aebc42..63bdec9fdd21 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7145,6 +7145,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { perf_remove_from_context(event); + unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->event_entry, &events); } @@ -7157,6 +7158,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) list_del(&event->event_entry); if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; + account_event_cpu(event, dst_cpu); perf_install_in_context(dst_ctx, event, dst_cpu); get_ctx(dst_ctx); } -- cgit v1.3-8-gc7d7 From ba8a75c16e292c0a3a87406a77508cbbc6cf4ee2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Jul 2013 02:31:04 +0200 Subject: perf: Account freq events per cpu This is going to be used by the full dynticks subsystem as a finer-grained information to know when to keep and when to stop the tick. Original-patch-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1374539466-4799-7-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 63bdec9fdd21..3fe385aa93e6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -141,6 +141,7 @@ enum event_type_t { struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); +static DEFINE_PER_CPU(atomic_t, perf_freq_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -3139,6 +3140,9 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) } if (is_cgroup_event(event)) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); + + if (event->attr.freq) + atomic_dec(&per_cpu(perf_freq_events, cpu)); } static void unaccount_event(struct perf_event *event) @@ -6474,6 +6478,9 @@ static void account_event_cpu(struct perf_event *event, int cpu) } if (is_cgroup_event(event)) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); + + if (event->attr.freq) + atomic_inc(&per_cpu(perf_freq_events, cpu)); } static void account_event(struct perf_event *event) -- cgit v1.3-8-gc7d7 From d84153d6c96f61aa06429586284639f32debf03e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Jul 2013 02:31:05 +0200 Subject: perf: Implement finer grained full dynticks kick Currently the full dynticks subsystem keep the tick alive as long as there are perf events running. This prevents the tick from being stopped as long as features such that the lockup detectors are running. As a temporary fix, the lockup detector is disabled by default when full dynticks is built but this is not a long term viable solution. To fix this, only keep the tick alive when an event configured with a frequency rather than a period is running on the CPU, or when an event throttles on the CPU. These are the only purposes of the perf tick, especially now that the rotation of flexible events is handled from a seperate hrtimer. The tick can be shutdown the rest of the time. Original-patch-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1374539466-4799-8-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 3fe385aa93e6..916cf1f593b4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -870,12 +870,8 @@ static void perf_pmu_rotate_start(struct pmu *pmu) WARN_ON(!irqs_disabled()); - if (list_empty(&cpuctx->rotation_list)) { - int was_empty = list_empty(head); + if (list_empty(&cpuctx->rotation_list)) list_add(&cpuctx->rotation_list, head); - if (was_empty) - tick_nohz_full_kick(); - } } static void get_ctx(struct perf_event_context *ctx) @@ -1875,6 +1871,9 @@ static int __perf_install_in_context(void *info) perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, task_ctx); + if (atomic_read(&__get_cpu_var(perf_freq_events))) + tick_nohz_full_kick(); + return 0; } @@ -2812,10 +2811,11 @@ done: #ifdef CONFIG_NO_HZ_FULL bool perf_event_can_stop_tick(void) { - if (list_empty(&__get_cpu_var(rotation_list))) - return true; - else + if (atomic_read(&__get_cpu_var(perf_freq_events)) || + __this_cpu_read(perf_throttled_count)) return false; + else + return true; } #endif @@ -5202,6 +5202,7 @@ static int __perf_event_overflow(struct perf_event *event, __this_cpu_inc(perf_throttled_count); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); + tick_nohz_full_kick(); ret = 1; } } -- cgit v1.3-8-gc7d7 From 93786a5f6aeb9c032c1c240246c5aabcf457b38f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 23 Jul 2013 02:31:06 +0200 Subject: watchdog: Make it work under full dynticks A perf event can be used without forcing the tick to stay alive if it doesn't use a frequency but a sample period and if it doesn't throttle (raise storm of events). Since the lockup detector neither use a perf event frequency nor should ever throttle due to its high period, it can now run concurrently with the full dynticks feature. So remove the hack that disabled the watchdog. Signed-off-by: Frederic Weisbecker Cc: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: Don Zickus Cc: Srivatsa S. Bhat Cc: Anish Singh Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1374539466-4799-9-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 1241d8c91d5e..51c4f34d258e 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -553,14 +553,6 @@ void __init lockup_detector_init(void) { set_sample_period(); -#ifdef CONFIG_NO_HZ_FULL - if (watchdog_user_enabled) { - watchdog_user_enabled = 0; - pr_warning("Disabled lockup detectors by default for full dynticks\n"); - pr_warning("You can reactivate it with 'sysctl -w kernel.watchdog=1'\n"); - } -#endif - if (watchdog_user_enabled) watchdog_enable_all_cpus(); } -- cgit v1.3-8-gc7d7 From 2a4ac63333584b2791986cf2270f5ba9a4b97606 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 16:16:40 +0800 Subject: cgroup: remove sparse tags from offline_css() This should have been removed in commit d7eeac1913ff ("cgroup: hold cgroup_mutex before calling css_offline"). While at it, update the comments. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 345fac8e4fba..41b559f51502 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4214,7 +4214,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, INIT_WORK(&css->dput_work, css_dput_fn); } -/* invoke ->post_create() on a new CSS and mark it online if successful */ +/* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { int ret = 0; @@ -4228,9 +4228,8 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return ret; } -/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ +/* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) - __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; -- cgit v1.3-8-gc7d7 From e0798ce27346edb8aa369b5b39af5a47fdf2b25c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 17:36:25 +0800 Subject: cgroup: remove struct cgroup_seqfile_state We can use struct cfent instead. v2: - remove cgroup_seqfile_release(). Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 45 +++++++++++++-------------------------------- 1 file changed, 13 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 41b559f51502..ed2104304833 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2397,11 +2397,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, * supports string->u64 maps, but can be extended in future. */ -struct cgroup_seqfile_state { - struct cftype *cft; - struct cgroup *cgroup; -}; - static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) { struct seq_file *sf = cb->state; @@ -2410,59 +2405,45 @@ static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) static int cgroup_seqfile_show(struct seq_file *m, void *arg) { - struct cgroup_seqfile_state *state = m->private; - struct cftype *cft = state->cft; + struct cfent *cfe = m->private; + struct cftype *cft = cfe->type; + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(state->cgroup, cft, &cb); + return cft->read_map(cgrp, cft, &cb); } - return cft->read_seq_string(state->cgroup, cft, m); -} - -static int cgroup_seqfile_release(struct inode *inode, struct file *file) -{ - struct seq_file *seq = file->private_data; - kfree(seq->private); - return single_release(inode, file); + return cft->read_seq_string(cgrp, cft, m); } static const struct file_operations cgroup_seqfile_operations = { .read = seq_read, .write = cgroup_file_write, .llseek = seq_lseek, - .release = cgroup_seqfile_release, + .release = single_release, }; static int cgroup_file_open(struct inode *inode, struct file *file) { int err; + struct cfent *cfe; struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cft = __d_cft(file->f_dentry); + cfe = __d_cfe(file->f_dentry); + cft = cfe->type; if (cft->read_map || cft->read_seq_string) { - struct cgroup_seqfile_state *state; - - state = kzalloc(sizeof(*state), GFP_USER); - if (!state) - return -ENOMEM; - - state->cft = cft; - state->cgroup = __d_cgrp(file->f_dentry->d_parent); file->f_op = &cgroup_seqfile_operations; - err = single_open(file, cgroup_seqfile_show, state); - if (err < 0) - kfree(state); - } else if (cft->open) + err = single_open(file, cgroup_seqfile_show, cfe); + } else if (cft->open) { err = cft->open(inode, file); - else - err = 0; + } return err; } -- cgit v1.3-8-gc7d7 From 6f4b7e632d78c2d91502211c430722cc66428492 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 16:18:36 +0800 Subject: cgroup: more naming cleanups Constantly use @cset for css_set variables and use @cgrp as cgroup variables. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 6 +++--- kernel/cgroup.c | 26 +++++++++++++------------- kernel/cpuset.c | 16 ++++++++-------- 3 files changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 297462b9f41a..00a7e07a1567 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -394,8 +394,8 @@ struct cgroup_map_cb { /* cftype->flags */ enum { - CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cg */ - CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cg */ + CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ + CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ }; @@ -513,7 +513,7 @@ struct cftype_set { }; struct cgroup_scanner { - struct cgroup *cg; + struct cgroup *cgrp; int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); void (*process_task)(struct task_struct *p, struct cgroup_scanner *scan); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ed2104304833..9577bebe2546 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -466,7 +466,7 @@ static inline void put_css_set_taskexit(struct css_set *cset) * @new_cgrp: cgroup that's being entered by the task * @template: desired set of css pointers in css_set (pre-calculated) * - * Returns true if "cg" matches "old_cg" except for the hierarchy + * Returns true if "cset" matches "old_cset" except for the hierarchy * which "new_cgrp" belongs to, for which it should match "new_cgrp". */ static bool compare_css_sets(struct css_set *cset, @@ -1839,7 +1839,7 @@ EXPORT_SYMBOL_GPL(task_cgroup_path_from_hierarchy); struct task_and_cgroup { struct task_struct *task; struct cgroup *cgrp; - struct css_set *cg; + struct css_set *cset; }; struct cgroup_taskset { @@ -2057,8 +2057,8 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, tc = flex_array_get(group, i); old_cset = task_css_set(tc->task); - tc->cg = find_css_set(old_cset, cgrp); - if (!tc->cg) { + tc->cset = find_css_set(old_cset, cgrp); + if (!tc->cset) { retval = -ENOMEM; goto out_put_css_set_refs; } @@ -2071,7 +2071,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, */ for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - cgroup_task_migrate(tc->cgrp, tc->task, tc->cg); + cgroup_task_migrate(tc->cgrp, tc->task, tc->cset); } /* nothing is sensitive to fork() after this point. */ @@ -2091,9 +2091,9 @@ out_put_css_set_refs: if (retval) { for (i = 0; i < group_size; i++) { tc = flex_array_get(group, i); - if (!tc->cg) + if (!tc->cset) break; - put_css_set(tc->cg); + put_css_set(tc->cset); } } out_cancel_attach: @@ -2203,9 +2203,9 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) mutex_lock(&cgroup_mutex); for_each_active_root(root) { - struct cgroup *from_cg = task_cgroup_from_root(from, root); + struct cgroup *from_cgrp = task_cgroup_from_root(from, root); - retval = cgroup_attach_task(from_cg, tsk, false); + retval = cgroup_attach_task(from_cgrp, tsk, false); if (retval) break; } @@ -3305,8 +3305,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_iter_start(scan->cg, &it); - while ((p = cgroup_iter_next(scan->cg, &it))) { + cgroup_iter_start(scan->cgrp, &it); + while ((p = cgroup_iter_next(scan->cgrp, &it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3339,7 +3339,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cg, &it); + cgroup_iter_end(scan->cgrp, &it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3385,7 +3385,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { struct cgroup_scanner scan; - scan.cg = from; + scan.cgrp = from; scan.test_task = NULL; /* select all tasks in cgroup */ scan.process_task = cgroup_transfer_one_task; scan.heap = NULL; diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 703bfd5a32a9..1b9c31549797 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -845,7 +845,7 @@ static void cpuset_change_cpumask(struct task_struct *tsk, { struct cpuset *cpus_cs; - cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cg)); + cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cgrp)); set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); } @@ -866,7 +866,7 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - scan.cg = cs->css.cgroup; + scan.cgrp = cs->css.cgroup; scan.test_task = NULL; scan.process_task = cpuset_change_cpumask; scan.heap = heap; @@ -1062,7 +1062,7 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, static void cpuset_change_nodemask(struct task_struct *p, struct cgroup_scanner *scan) { - struct cpuset *cs = cgroup_cs(scan->cg); + struct cpuset *cs = cgroup_cs(scan->cgrp); struct mm_struct *mm; int migrate; nodemask_t *newmems = scan->data; @@ -1102,7 +1102,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) guarantee_online_mems(mems_cs, &newmems); - scan.cg = cs->css.cgroup; + scan.cgrp = cs->css.cgroup; scan.test_task = NULL; scan.process_task = cpuset_change_nodemask; scan.heap = heap; @@ -1275,7 +1275,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) static void cpuset_change_flag(struct task_struct *tsk, struct cgroup_scanner *scan) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cg), tsk); + cpuset_update_task_spread_flag(cgroup_cs(scan->cgrp), tsk); } /* @@ -1295,7 +1295,7 @@ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { struct cgroup_scanner scan; - scan.cg = cs->css.cgroup; + scan.cgrp = cs->css.cgroup; scan.test_task = NULL; scan.process_task = cpuset_change_flag; scan.heap = heap; @@ -1971,7 +1971,7 @@ static int cpuset_css_online(struct cgroup *cgrp) struct cpuset *cs = cgroup_cs(cgrp); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; - struct cgroup *pos_cg; + struct cgroup *pos_cgrp; if (!parent) return 0; @@ -2003,7 +2003,7 @@ static int cpuset_css_online(struct cgroup *cgrp) * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_cg, parent) { + cpuset_for_each_child(tmp_cs, pos_cgrp, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; -- cgit v1.3-8-gc7d7 From 4e96ee8e981b5140a2bcc5fff0d5c0eef39a62ee Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 31 Jul 2013 09:50:50 +0800 Subject: cgroup: convert cgroup_ida to cgroup_idr This enables us to lookup a cgroup by its id. v4: - add a comment for idr_remove() in cgroup_offline_fn(). v3: - on success, idr_alloc() returns the id but not 0, so fix the BUG_ON() in cgroup_init(). - pass the right value to idr_alloc() so that the id for dummy cgroup is 0. Signed-off-by: Li Zefan Reviewed-by: Michal Hocko Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 33 +++++++++++++++++++++++++++------ 2 files changed, 29 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 00a7e07a1567..cca570e188fb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -161,7 +161,7 @@ struct cgroup_name { struct cgroup { unsigned long flags; /* "unsigned long" so bitops work */ - int id; /* ida allocated in-hierarchy ID */ + int id; /* idr allocated in-hierarchy ID */ /* * We link our 'sibling' struct into our parent's 'children'. @@ -322,7 +322,7 @@ struct cgroupfs_root { unsigned long flags; /* IDs for cgroups in this hierarchy */ - struct ida cgroup_ida; + struct idr cgroup_idr; /* The path to use for release notifications. */ char release_agent_path[PATH_MAX]; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9577bebe2546..3f6593333525 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -866,8 +866,6 @@ static void cgroup_free_fn(struct work_struct *work) */ dput(cgrp->parent->dentry); - ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); - /* * Drop the active superblock reference that we took when we * created the cgroup. This will free cgrp->root, if we are @@ -1379,6 +1377,7 @@ static void init_cgroup_root(struct cgroupfs_root *root) cgrp->root = root; RCU_INIT_POINTER(cgrp->name, &root_cgroup_name); init_cgroup_housekeeping(cgrp); + idr_init(&root->cgroup_idr); } static int cgroup_init_root_id(struct cgroupfs_root *root, int start, int end) @@ -1451,7 +1450,6 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) */ root->subsys_mask = opts->subsys_mask; root->flags = opts->flags; - ida_init(&root->cgroup_ida); if (opts->release_agent) strcpy(root->release_agent_path, opts->release_agent); if (opts->name) @@ -1467,7 +1465,7 @@ static void cgroup_free_root(struct cgroupfs_root *root) /* hierarhcy ID shoulid already have been released */ WARN_ON_ONCE(root->hierarchy_id); - ida_destroy(&root->cgroup_ida); + idr_destroy(&root->cgroup_idr); kfree(root); } } @@ -1582,6 +1580,11 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, mutex_lock(&cgroup_mutex); mutex_lock(&cgroup_root_mutex); + root_cgrp->id = idr_alloc(&root->cgroup_idr, root_cgrp, + 0, 1, GFP_KERNEL); + if (root_cgrp->id < 0) + goto unlock_drop; + /* Check for name clashes with existing mounts */ ret = -EBUSY; if (strlen(root->name)) @@ -4253,7 +4256,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_cgrp; rcu_assign_pointer(cgrp->name, name); - cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); + /* + * Temporarily set the pointer to NULL, so idr_find() won't return + * a half-baked cgroup. + */ + cgrp->id = idr_alloc(&root->cgroup_idr, NULL, 1, 0, GFP_KERNEL); if (cgrp->id < 0) goto err_free_name; @@ -4351,6 +4358,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, } } + idr_replace(&root->cgroup_idr, cgrp, cgrp->id); + err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); if (err) goto err_destroy; @@ -4377,7 +4386,7 @@ err_free_all: /* Release the reference count that we took on the superblock */ deactivate_super(sb); err_free_id: - ida_simple_remove(&root->cgroup_ida, cgrp->id); + idr_remove(&root->cgroup_idr, cgrp->id); err_free_name: kfree(rcu_dereference_raw(cgrp->name)); err_free_cgrp: @@ -4570,6 +4579,14 @@ static void cgroup_offline_fn(struct work_struct *work) /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); + /* + * We should remove the cgroup object from idr before its grace + * period starts, so we won't be looking up a cgroup while the + * cgroup is being freed. + */ + idr_remove(&cgrp->root->cgroup_idr, cgrp->id); + cgrp->id = -1; + dput(d); set_bit(CGRP_RELEASABLE, &parent->flags); @@ -4895,6 +4912,10 @@ int __init cgroup_init(void) BUG_ON(cgroup_init_root_id(&cgroup_dummy_root, 0, 1)); + err = idr_alloc(&cgroup_dummy_root.cgroup_idr, cgroup_dummy_top, + 0, 1, GFP_KERNEL); + BUG_ON(err < 0); + mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); -- cgit v1.3-8-gc7d7 From 876ede8b2b9880615be0de3ec7b8afd0a1786e76 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Aug 2013 09:51:47 +0800 Subject: cgroup: restructure the failure path in cgroup_write_event_control() It uses a single label and checks the validity of each pointer. This is err-prone, and actually we had a bug because one of the check was insufficient. Use multi lables as we do in other places. v2: - drop initializations of local variables. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f6593333525..9f6dab22289e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3934,11 +3934,11 @@ static void cgroup_event_ptable_queue_proc(struct file *file, static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, const char *buffer) { - struct cgroup_event *event = NULL; + struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; - struct file *efile = NULL; - struct file *cfile = NULL; + struct file *efile; + struct file *cfile; char *endp; int ret; @@ -3964,31 +3964,31 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, efile = eventfd_fget(efd); if (IS_ERR(efile)) { ret = PTR_ERR(efile); - goto fail; + goto out_kfree; } event->eventfd = eventfd_ctx_fileget(efile); if (IS_ERR(event->eventfd)) { ret = PTR_ERR(event->eventfd); - goto fail; + goto out_put_efile; } cfile = fget(cfd); if (!cfile) { ret = -EBADF; - goto fail; + goto out_put_eventfd; } /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ ret = inode_permission(file_inode(cfile), MAY_READ); if (ret < 0) - goto fail; + goto out_put_cfile; event->cft = __file_cft(cfile); if (IS_ERR(event->cft)) { ret = PTR_ERR(event->cft); - goto fail; + goto out_put_cfile; } /* @@ -3998,18 +3998,18 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); if (cgrp_cfile != cgrp) { ret = -EINVAL; - goto fail; + goto out_put_cfile; } if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto fail; + goto out_put_cfile; } ret = event->cft->register_event(cgrp, event->cft, event->eventfd, buffer); if (ret) - goto fail; + goto out_put_cfile; efile->f_op->poll(efile, &event->pt); @@ -4029,16 +4029,13 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, return 0; -fail: - if (cfile) - fput(cfile); - - if (event && event->eventfd && !IS_ERR(event->eventfd)) - eventfd_ctx_put(event->eventfd); - - if (!IS_ERR_OR_NULL(efile)) - fput(efile); - +out_put_cfile: + fput(cfile); +out_put_eventfd: + eventfd_ctx_put(event->eventfd); +out_put_efile: + fput(efile); +out_kfree: kfree(event); return ret; -- cgit v1.3-8-gc7d7 From b395890a092d8ecbe54f005179e3dec4b6bf752a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 1 Aug 2013 09:52:15 +0800 Subject: cgroup: rename cgroup_pidlist->mutex It's a rw_semaphore not a mutex. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 9f6dab22289e..9420662df87e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3436,7 +3436,7 @@ struct cgroup_pidlist { /* pointer to the cgroup we belong to, for list removal purposes */ struct cgroup *owner; /* protects the other fields */ - struct rw_semaphore mutex; + struct rw_semaphore rwsem; }; /* @@ -3509,7 +3509,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, struct pid_namespace *ns = task_active_pid_ns(current); /* - * We can't drop the pidlist_mutex before taking the l->mutex in case + * We can't drop the pidlist_mutex before taking the l->rwsem in case * the last ref-holder is trying to remove l from the list at the same * time. Holding the pidlist_mutex precludes somebody taking whichever * list we find out from under us - compare release_pid_array(). @@ -3518,7 +3518,7 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { /* make sure l doesn't vanish out from under us */ - down_write(&l->mutex); + down_write(&l->rwsem); mutex_unlock(&cgrp->pidlist_mutex); return l; } @@ -3529,8 +3529,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_unlock(&cgrp->pidlist_mutex); return l; } - init_rwsem(&l->mutex); - down_write(&l->mutex); + init_rwsem(&l->rwsem); + down_write(&l->rwsem); l->key.type = type; l->key.ns = get_pid_ns(ns); l->owner = cgrp; @@ -3591,7 +3591,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, l->list = array; l->length = length; l->use_count++; - up_write(&l->mutex); + up_write(&l->rwsem); *lp = l; return 0; } @@ -3669,7 +3669,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) int index = 0, pid = *pos; int *iter; - down_read(&l->mutex); + down_read(&l->rwsem); if (pid) { int end = l->length; @@ -3696,7 +3696,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct cgroup_pidlist *l = s->private; - up_read(&l->mutex); + up_read(&l->rwsem); } static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) @@ -3742,7 +3742,7 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) * pidlist_mutex, we have to take pidlist_mutex first. */ mutex_lock(&l->owner->pidlist_mutex); - down_write(&l->mutex); + down_write(&l->rwsem); BUG_ON(!l->use_count); if (!--l->use_count) { /* we're the last user if refcount is 0; remove and free */ @@ -3750,12 +3750,12 @@ static void cgroup_release_pid_array(struct cgroup_pidlist *l) mutex_unlock(&l->owner->pidlist_mutex); pidlist_free(l->list); put_pid_ns(l->key.ns); - up_write(&l->mutex); + up_write(&l->rwsem); kfree(l); return; } mutex_unlock(&l->owner->pidlist_mutex); - up_write(&l->mutex); + up_write(&l->rwsem); } static int cgroup_pidlist_release(struct inode *inode, struct file *file) -- cgit v1.3-8-gc7d7 From 41e85ce8220c6e5fdef706fda6696cd291115b63 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 1 Aug 2013 18:59:41 +0200 Subject: hung_task debugging: Print more info when reporting the problem printk(KERN_ERR) from check_hung_task() likely means we have a bug, but unlike BUG_ON()/WARN_ON ()it doesn't show the kernel version, this complicates the bug-reports investigation. Add the additional pr_err() to print tainted/release/version like dump_stack_print_info() does, the output becomes: INFO: task perl:504 blocked for more than 2 seconds. Not tainted 3.11.0-rc1-10367-g136bb46-dirty #1763 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ... While at it, turn the old printk's into pr_err(). Signed-off-by: Oleg Nesterov Cc: ahecox@redhat.com Cc: Christopher Williams Cc: dwysocha@redhat.com Cc: gavin@redhat.com Cc: Mandeep Singh Baines Cc: nshi@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130801165941.GA17544@redhat.com Signed-off-by: Ingo Molnar --- kernel/hung_task.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 6df614912b9d..3e97fb126e6b 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * The number of tasks checked: @@ -99,10 +100,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) * Ok, the task did not get scheduled for more than 2 minutes, * complain: */ - printk(KERN_ERR "INFO: task %s:%d blocked for more than " - "%ld seconds.\n", t->comm, t->pid, timeout); - printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" - " disables this message.\n"); + pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n", + t->comm, t->pid, timeout); + pr_err(" %s %s %.*s\n", + print_tainted(), init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" + " disables this message.\n"); sched_show_task(t); debug_show_held_locks(t); -- cgit v1.3-8-gc7d7 From 16cf48a6d3e8f9ebe3c3231c12cbe4b0c4ed4d24 Mon Sep 17 00:00:00 2001 From: Andreas Bießmann Date: Fri, 2 Aug 2013 12:23:34 +0200 Subject: register_console: prevent adding the same console twice MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch guards the console_drivers list to be corrupted. The for_each_console() macro insist on a strictly forward list ended by NULL: con0->next->con1->next->NULL Without this patch it may happen easily to destroy this list for example by adding 'earlyprintk' twice, especially on embedded devices where the early console is often a single static instance. This will result in the following list: con0->next->con0 This in turn will result in an endless loop in console_unlock() later on by printing the first __log_buf line endlessly. Signed-off-by: Andreas Bießmann Cc: Kay Sievers Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- kernel/printk/printk.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5b5a7080e2a5..b4e8500afdb3 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2226,6 +2226,13 @@ void register_console(struct console *newcon) struct console *bcon = NULL; struct console_cmdline *c; + if (console_drivers) + for_each_console(bcon) + if (WARN(bcon == newcon, + "console '%s%d' already registered\n", + bcon->name, bcon->index)) + return; + /* * before we register a new CON_BOOT console, make sure we don't * already have a valid console -- cgit v1.3-8-gc7d7 From d6efc2f7240b4e55590df69d74f33fdb72ce934a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 5 Aug 2013 15:02:49 -0700 Subject: x86, asmlinkage, power: Make various symbols used by the suspend asm code visible Signed-off-by: Andi Kleen Link: http://lkml.kernel.org/r/1375740170-7446-16-git-send-email-andi@firstfloor.org Signed-off-by: H. Peter Anvin --- arch/x86/power/cpu.c | 8 ++++---- arch/x86/power/hibernate_64.c | 12 ++++++------ kernel/power/hibernate.c | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 1cf5b300305e..424f4c97a44d 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -25,10 +25,10 @@ #include #ifdef CONFIG_X86_32 -unsigned long saved_context_ebx; -unsigned long saved_context_esp, saved_context_ebp; -unsigned long saved_context_esi, saved_context_edi; -unsigned long saved_context_eflags; +__visible unsigned long saved_context_ebx; +__visible unsigned long saved_context_esp, saved_context_ebp; +__visible unsigned long saved_context_esi, saved_context_edi; +__visible unsigned long saved_context_eflags; #endif struct saved_context saved_context; diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index a0fde91c16cf..304fca20d96e 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -20,26 +20,26 @@ #include /* References to section boundaries */ -extern const void __nosave_begin, __nosave_end; +extern __visible const void __nosave_begin, __nosave_end; /* Defined in hibernate_asm_64.S */ -extern int restore_image(void); +extern asmlinkage int restore_image(void); /* * Address to jump to in the last phase of restore in order to get to the image * kernel's text (this value is passed in the image header). */ -unsigned long restore_jump_address; +unsigned long restore_jump_address __visible; /* * Value of the cr3 register from before the hibernation (this value is passed * in the image header). */ -unsigned long restore_cr3; +unsigned long restore_cr3 __visible; -pgd_t *temp_level4_pgt; +pgd_t *temp_level4_pgt __visible; -void *relocated_restore_code; +void *relocated_restore_code __visible; static void *alloc_pgt_page(void *context) { diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773e..3085e62a80a5 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -39,7 +39,7 @@ static int resume_delay; static char resume_file[256] = CONFIG_PM_STD_PARTITION; dev_t swsusp_resume_device; sector_t swsusp_resume_block; -int in_suspend __nosavedata; +__visible int in_suspend __nosavedata; enum { HIBERNATION_INVALID, -- cgit v1.3-8-gc7d7 From cf4957f17f2a89984915ea808876d9c82225b862 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 24 Oct 2012 13:37:58 +0200 Subject: perf: Add PERF_EVENT_IOC_ID ioctl to return event ID The only way to get the event ID is by reading the event fd, followed by parsing the ID value out of the returned data. While this is ok for current read format used by perf tool, it is not ok when we use PERF_FORMAT_GROUP format. With this format the data are returned for the whole group and there's no way to find out what ID belongs to our fd (if we are not group leader event). Adding a simple ioctl that returns event primary ID for given fd. Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-v1bn5cto707jn0bon34afqr1@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 9 +++++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index efef1d37a371..62c25a25291c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -321,6 +321,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, u64 *) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/events/core.c b/kernel/events/core.c index 916cf1f593b4..5200b608b481 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3568,6 +3568,15 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_EVENT_IOC_PERIOD: return perf_event_period(event, (u64 __user *)arg); + case PERF_EVENT_IOC_ID: + { + u64 id = primary_event_id(event); + + if (copy_to_user((void __user *)arg, &id, sizeof(id))) + return -EFAULT; + return 0; + } + case PERF_EVENT_IOC_SET_OUTPUT: { int ret; -- cgit v1.3-8-gc7d7 From 6f5ab0019fd328b50a8488c9e5193fc1dbd8d6ed Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 15 Oct 2012 20:13:45 +0200 Subject: perf: Do not get values from disabled counters in group format read It's possible some of the counters in the group could be disabled when sampling member of the event group is reading the rest via PERF_SAMPLE_READ sample type processing. Disabled counters could then produce wrong numbers. Fixing that by reading only enabled counters for PERF_SAMPLE_READ sample type processing. Signed-off-by: Jiri Olsa Acked-by: Namhyung Kim Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-wwkjb0bbcuslnz0klrmqi26r@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 5200b608b481..e82e70025d42 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4388,7 +4388,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, list_for_each_entry(sub, &leader->sibling_list, group_entry) { n = 0; - if (sub != event) + if ((sub != event) && + (sub->state == PERF_EVENT_STATE_ACTIVE)) sub->pmu->read(sub); values[n++] = perf_event_count(sub); -- cgit v1.3-8-gc7d7 From 8af01f56a03e9cbd91a55d688fce1315021efba8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: s/cgroup_subsys_state/cgroup_css/ s/task_subsys_state/task_css/ The names of the two struct cgroup_subsys_state accessors - cgroup_subsys_state() and task_subsys_state() - are somewhat awkward. The former clashes with the type name and the latter doesn't even indicate it's somehow related to cgroup. We're about to revamp large portion of cgroup API, so, let's rename them so that they're less awkward. Most per-controller usages of the accessors are localized in accessor wrappers and given the amount of scheduled changes, this isn't gonna add any noticeable headache. Rename cgroup_subsys_state() to cgroup_css() and task_subsys_state() to task_css(). This patch is pure rename. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-cgroup.h | 5 ++--- fs/bio.c | 2 +- include/linux/cgroup.h | 31 +++++++++++++++++++------------ include/net/cls_cgroup.h | 4 ++-- include/net/netprio_cgroup.h | 4 ++-- kernel/cgroup.c | 2 +- kernel/cgroup_freezer.c | 4 ++-- kernel/cpuset.c | 6 +++--- kernel/events/core.c | 6 +++--- kernel/sched/core.c | 4 ++-- kernel/sched/cpuacct.c | 4 ++-- kernel/sched/sched.h | 6 +++--- mm/hugetlb_cgroup.c | 6 ++---- mm/memcontrol.c | 5 ++--- mm/vmpressure.c | 2 +- net/core/netprio_cgroup.c | 2 +- net/sched/cls_cgroup.c | 4 ++-- security/device_cgroup.c | 4 ++-- 18 files changed, 52 insertions(+), 49 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8056c03a3382..628e50f6f8a8 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -181,14 +181,13 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx); static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { - return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), + return container_of(cgroup_css(cgroup, blkio_subsys_id), struct blkcg, css); } static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, blkio_subsys_id), - struct blkcg, css); + return container_of(task_css(tsk, blkio_subsys_id), struct blkcg, css); } static inline struct blkcg *bio_blkcg(struct bio *bio) diff --git a/fs/bio.c b/fs/bio.c index 94bbc04dba77..8e0348f6e5bd 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1946,7 +1946,7 @@ int bio_associate_current(struct bio *bio) /* associate blkcg if exists */ rcu_read_lock(); - css = task_subsys_state(current, blkio_subsys_id); + css = task_css(current, blkio_subsys_id); if (css && css_tryget(css)) bio->bi_css = css; rcu_read_unlock(); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 44dd422d7e9b..552c5feef733 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -647,8 +647,15 @@ struct cgroup_subsys { #undef IS_SUBSYS_ENABLED #undef SUBSYS -static inline struct cgroup_subsys_state *cgroup_subsys_state( - struct cgroup *cgrp, int subsys_id) +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @subsys_id: the subsystem of interest + * + * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + */ +static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + int subsys_id) { return cgrp->subsys[subsys_id]; } @@ -678,7 +685,7 @@ extern struct mutex cgroup_mutex; #endif /** - * task_subsys_state_check - obtain css for (task, subsys) w/ extra access conds + * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() @@ -686,7 +693,7 @@ extern struct mutex cgroup_mutex; * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ -#define task_subsys_state_check(task, subsys_id, __c) \ +#define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** @@ -701,22 +708,22 @@ static inline struct css_set *task_css_set(struct task_struct *task) } /** - * task_subsys_state - obtain css for (task, subsys) + * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * - * See task_subsys_state_check(). + * See task_css_check(). */ -static inline struct cgroup_subsys_state * -task_subsys_state(struct task_struct *task, int subsys_id) +static inline struct cgroup_subsys_state *task_css(struct task_struct *task, + int subsys_id) { - return task_subsys_state_check(task, subsys_id, false); + return task_css_check(task, subsys_id, false); } -static inline struct cgroup* task_cgroup(struct task_struct *task, - int subsys_id) +static inline struct cgroup *task_cgroup(struct task_struct *task, + int subsys_id) { - return task_subsys_state(task, subsys_id)->cgroup; + return task_css(task, subsys_id)->cgroup; } /** diff --git a/include/net/cls_cgroup.h b/include/net/cls_cgroup.h index 0fee0617fb7d..52adaa75dac9 100644 --- a/include/net/cls_cgroup.h +++ b/include/net/cls_cgroup.h @@ -35,7 +35,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - classid = container_of(task_subsys_state(p, net_cls_subsys_id), + classid = container_of(task_css(p, net_cls_subsys_id), struct cgroup_cls_state, css)->classid; rcu_read_unlock(); @@ -51,7 +51,7 @@ static inline u32 task_cls_classid(struct task_struct *p) return 0; rcu_read_lock(); - css = task_subsys_state(p, net_cls_subsys_id); + css = task_css(p, net_cls_subsys_id); if (css) classid = container_of(css, struct cgroup_cls_state, css)->classid; diff --git a/include/net/netprio_cgroup.h b/include/net/netprio_cgroup.h index 50ab8c26ab59..8110fa7ae60a 100644 --- a/include/net/netprio_cgroup.h +++ b/include/net/netprio_cgroup.h @@ -39,7 +39,7 @@ static inline u32 task_netprioidx(struct task_struct *p) u32 idx; rcu_read_lock(); - css = task_subsys_state(p, net_prio_subsys_id); + css = task_css(p, net_prio_subsys_id); idx = css->cgroup->id; rcu_read_unlock(); return idx; @@ -53,7 +53,7 @@ static inline u32 task_netprioidx(struct task_struct *p) u32 idx = 0; rcu_read_lock(); - css = task_subsys_state(p, net_prio_subsys_id); + css = task_css(p, net_prio_subsys_id); if (css) idx = css->cgroup->id; rcu_read_unlock(); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ae4c46834633..0b3caa3220cb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -81,7 +81,7 @@ */ #ifdef CONFIG_PROVE_RCU DEFINE_MUTEX(cgroup_mutex); -EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for task_subsys_state_check() */ +EXPORT_SYMBOL_GPL(cgroup_mutex); /* only for lockdep */ #else static DEFINE_MUTEX(cgroup_mutex); #endif diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 75dda1ea5026..9d3f61566fec 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -47,13 +47,13 @@ struct freezer { static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of(cgroup_subsys_state(cgroup, freezer_subsys_id), + return container_of(cgroup_css(cgroup, freezer_subsys_id), struct freezer, css); } static inline struct freezer *task_freezer(struct task_struct *task) { - return container_of(task_subsys_state(task, freezer_subsys_id), + return container_of(task_css(task, freezer_subsys_id), struct freezer, css); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1b9c31549797..be4512ba2c0c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -117,14 +117,14 @@ struct cpuset { /* Retrieve the cpuset for a cgroup */ static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpuset_subsys_id), + return container_of(cgroup_css(cgrp, cpuset_subsys_id), struct cpuset, css); } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return container_of(task_subsys_state(task, cpuset_subsys_id), + return container_of(task_css(task, cpuset_subsys_id), struct cpuset, css); } @@ -2724,7 +2724,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v) goto out_free; rcu_read_lock(); - css = task_subsys_state(tsk, cpuset_subsys_id); + css = task_css(tsk, cpuset_subsys_id); retval = cgroup_path(css->cgroup, buf, PAGE_SIZE); rcu_read_unlock(); if (retval < 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 1833bc5a84a7..414c61f4d776 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -340,8 +340,8 @@ struct perf_cgroup { static inline struct perf_cgroup * perf_cgroup_from_task(struct task_struct *task) { - return container_of(task_subsys_state(task, perf_subsys_id), - struct perf_cgroup, css); + return container_of(task_css(task, perf_subsys_id), + struct perf_cgroup, css); } static inline bool @@ -7798,7 +7798,7 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) static void perf_cgroup_css_free(struct cgroup *cont) { struct perf_cgroup *jc; - jc = container_of(cgroup_subsys_state(cont, perf_subsys_id), + jc = container_of(cgroup_css(cont, perf_subsys_id), struct perf_cgroup, css); free_percpu(jc->info); kfree(jc); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b1f2e533b95..323d907eac1a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6761,7 +6761,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->put_prev_task(rq, tsk); - tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, + tg = container_of(task_css_check(tsk, cpu_cgroup_subsys_id, lockdep_is_held(&tsk->sighand->siglock)), struct task_group, css); tg = autogroup_task_group(tsk, tg); @@ -7086,7 +7086,7 @@ int sched_rt_handler(struct ctl_table *table, int write, /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpu_cgroup_subsys_id), + return container_of(cgroup_css(cgrp, cpu_cgroup_subsys_id), struct task_group, css); } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dbb7e2cd95eb..4a210faaab77 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -36,14 +36,14 @@ struct cpuacct { /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + return container_of(cgroup_css(cgrp, cpuacct_subsys_id), struct cpuacct, css); } /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + return container_of(task_css(tsk, cpuacct_subsys_id), struct cpuacct, css); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef0a7b2439dd..471a56db05ea 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -665,9 +665,9 @@ extern int group_balance_cpu(struct sched_group *sg); /* * Return the group to which this tasks belongs. * - * We cannot use task_subsys_state() and friends because the cgroup - * subsystem changes that value before the cgroup_subsys::attach() method - * is called, therefore we cannot pin it and might observe the wrong value. + * We cannot use task_css() and friends because the cgroup subsystem + * changes that value before the cgroup_subsys::attach() method is called, + * therefore we cannot pin it and might observe the wrong value. * * The same is true for autogroup's p->signal->autogroup->tg, the autogroup * core changes this before calling sched_move_task(). diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9cea7de22ffb..50f213fc52c7 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -42,15 +42,13 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) static inline struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) { - return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, - hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(cgroup_css(cgroup, hugetlb_subsys_id)); } static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { - return hugetlb_cgroup_from_css(task_subsys_state(task, - hugetlb_subsys_id)); + return hugetlb_cgroup_from_css(task_css(task, hugetlb_subsys_id)); } static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d12ca6f3c293..b47bd3ad3c2b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1037,8 +1037,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { - return mem_cgroup_from_css( - cgroup_subsys_state(cont, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -1051,7 +1050,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); + return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); } struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 736a6011c2c8..7f1654d3cec7 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -76,7 +76,7 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) { - return css_to_vmpressure(cgroup_subsys_state(cg, mem_cgroup_subsys_id)); + return css_to_vmpressure(cgroup_css(cg, mem_cgroup_subsys_id)); } static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index e533259dce3c..ccf852311987 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -31,7 +31,7 @@ static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, net_prio_subsys_id), + return container_of(cgroup_css(cgrp, net_prio_subsys_id), struct cgroup_netprio_state, css); } diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 3a294eb98d61..5ee72a001df0 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -25,13 +25,13 @@ static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) { - return container_of(cgroup_subsys_state(cgrp, net_cls_subsys_id), + return container_of(cgroup_css(cgrp, net_cls_subsys_id), struct cgroup_cls_state, css); } static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return container_of(task_subsys_state(p, net_cls_subsys_id), + return container_of(task_css(p, net_cls_subsys_id), struct cgroup_cls_state, css); } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index e8aad69f0d69..87a0a037fbd6 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -58,12 +58,12 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) { - return css_to_devcgroup(cgroup_subsys_state(cgroup, devices_subsys_id)); + return css_to_devcgroup(cgroup_css(cgroup, devices_subsys_id)); } static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { - return css_to_devcgroup(task_subsys_state(task, devices_subsys_id)); + return css_to_devcgroup(task_css(task, devices_subsys_id)); } struct cgroup_subsys devices_subsys; -- cgit v1.3-8-gc7d7 From c9710d8018273b0740e0794858f1961fcea5e61a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cpuset: drop "const" qualifiers from struct cpuset instances cpuset uses "const" qualifiers on struct cpuset in some functions; however, it doesn't work well when a value derived from returned const pointer has to be passed to an accessor. It's C after all. Drop the "const" qualifiers except for the trivially leaf ones. This patch doesn't make any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cpuset.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index be4512ba2c0c..f7371341d42a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -128,7 +128,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) struct cpuset, css); } -static inline struct cpuset *parent_cs(const struct cpuset *cs) +static inline struct cpuset *parent_cs(struct cpuset *cs) { struct cgroup *pcgrp = cs->css.cgroup->parent; @@ -319,8 +319,7 @@ static struct file_system_type cpuset_fs_type = { * * Call with callback_mutex held. */ -static void guarantee_online_cpus(const struct cpuset *cs, - struct cpumask *pmask) +static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask) { while (!cpumask_intersects(cs->cpus_allowed, cpu_online_mask)) cs = parent_cs(cs); @@ -338,7 +337,7 @@ static void guarantee_online_cpus(const struct cpuset *cs, * * Call with callback_mutex held. */ -static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) +static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask) { while (!nodes_intersects(cs->mems_allowed, node_states[N_MEMORY])) cs = parent_cs(cs); @@ -383,7 +382,7 @@ static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q) * alloc_trial_cpuset - allocate a trial cpuset * @cs: the cpuset that the trial cpuset duplicates */ -static struct cpuset *alloc_trial_cpuset(const struct cpuset *cs) +static struct cpuset *alloc_trial_cpuset(struct cpuset *cs) { struct cpuset *trial; @@ -430,7 +429,7 @@ static void free_trial_cpuset(struct cpuset *trial) * Return 0 if valid, -errno if not. */ -static int validate_change(const struct cpuset *cur, const struct cpuset *trial) +static int validate_change(struct cpuset *cur, struct cpuset *trial) { struct cgroup *cgrp; struct cpuset *c, *par; @@ -2343,7 +2342,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) void cpuset_cpus_allowed_fallback(struct task_struct *tsk) { - const struct cpuset *cpus_cs; + struct cpuset *cpus_cs; rcu_read_lock(); cpus_cs = effective_cpumask_cpuset(task_cs(tsk)); @@ -2416,7 +2415,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask) * callback_mutex. If no ancestor is mem_exclusive or mem_hardwall * (an unusual configuration), then returns the root cpuset. */ -static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) +static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) { while (!(is_mem_exclusive(cs) || is_mem_hardwall(cs)) && parent_cs(cs)) cs = parent_cs(cs); @@ -2486,7 +2485,7 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs) */ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - const struct cpuset *cs; /* current cpuset ancestors */ + struct cpuset *cs; /* current cpuset ancestors */ int allowed; /* is allocation in zone z allowed? */ if (in_interrupt() || (gfp_mask & __GFP_THISNODE)) -- cgit v1.3-8-gc7d7 From 72c97e54e0f043d33b246d7460ae0a36c4b8c643 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:22 -0400 Subject: cgroup: add subsystem pointer to cgroup_subsys_state Currently, given a cgroup_subsys_state, there's no way to find out which subsystem the css is for, which we'll need to convert the cgroup controller API to primarily use @css instead of @cgroup. This patch adds cgroup_subsys_state->ss which points to the subsystem the @css belongs to. While at it, remove the comment about accessing @css->cgroup to determine the hierarchy. cgroup core will provide API to traverse hierarchy of css'es and we don't want subsystems to directly walk cgroup hierarchies anymore. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 9 ++++----- kernel/cgroup.c | 1 + 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 552c5feef733..821678aae4db 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -66,13 +66,12 @@ enum cgroup_subsys_id { /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { - /* - * The cgroup that this subsystem is attached to. Useful - * for subsystems that want to know about the cgroup - * hierarchy structure - */ + /* the cgroup that this css is attached to */ struct cgroup *cgroup; + /* the cgroup subsystem that this css is attached to */ + struct cgroup_subsys *ss; + /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b3caa3220cb..4234428f1014 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4186,6 +4186,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, struct cgroup *cgrp) { css->cgroup = cgrp; + css->ss = ss; css->flags = 0; css->id = NULL; if (cgrp == cgroup_dummy_top) -- cgit v1.3-8-gc7d7 From a7c6d554aa01236ac2a9f851ab0f75704f76dfa2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add/update accessors which obtain subsys specific data from css css (cgroup_subsys_state) is usually embedded in a subsys specific data structure. Subsystems either use container_of() directly to cast from css to such data structure or has an accessor function wrapping such cast. As cgroup as whole is moving towards using css as the main interface handle, add and update such accessors to ease dealing with css's. All accessors explicitly handle NULL input and return NULL in those cases. While this looks like an extra branch in the code, as all controllers specific data structures have css as the first field, the casting doesn't involve any offsetting and the compiler can trivially optimize out the branch. * blkio, freezer, cpuset, cpu, cpuacct and net_cls didn't have such accessor. Added. * memory, hugetlb and devices already had one but didn't explicitly handle NULL input. Updated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-cgroup.h | 12 ++++++++---- kernel/cgroup_freezer.c | 11 +++++++---- kernel/cpuset.c | 11 +++++++---- kernel/sched/core.c | 8 ++++++-- kernel/sched/cpuacct.c | 11 +++++++---- mm/hugetlb_cgroup.c | 2 +- mm/memcontrol.c | 2 +- net/sched/cls_cgroup.c | 11 +++++++---- security/device_cgroup.c | 2 +- 9 files changed, 45 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 628e50f6f8a8..8e5863e900bf 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -179,21 +179,25 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, void blkg_conf_finish(struct blkg_conf_ctx *ctx); +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { - return container_of(cgroup_css(cgroup, blkio_subsys_id), - struct blkcg, css); + return css_to_blkcg(cgroup_css(cgroup, blkio_subsys_id)); } static inline struct blkcg *task_blkcg(struct task_struct *tsk) { - return container_of(task_css(tsk, blkio_subsys_id), struct blkcg, css); + return css_to_blkcg(task_css(tsk, blkio_subsys_id)); } static inline struct blkcg *bio_blkcg(struct bio *bio) { if (bio && bio->bi_css) - return container_of(bio->bi_css, struct blkcg, css); + return css_to_blkcg(bio->bi_css); return task_blkcg(current); } diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 9d3f61566fec..1db686e47a22 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -45,16 +45,19 @@ struct freezer { spinlock_t lock; }; +static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct freezer, css) : NULL; +} + static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) { - return container_of(cgroup_css(cgroup, freezer_subsys_id), - struct freezer, css); + return css_freezer(cgroup_css(cgroup, freezer_subsys_id)); } static inline struct freezer *task_freezer(struct task_struct *task) { - return container_of(task_css(task, freezer_subsys_id), - struct freezer, css); + return css_freezer(task_css(task, freezer_subsys_id)); } static struct freezer *parent_freezer(struct freezer *freezer) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f7371341d42a..6e9cbdde25bd 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -114,18 +114,21 @@ struct cpuset { int relax_domain_level; }; +static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuset, css) : NULL; +} + /* Retrieve the cpuset for a cgroup */ static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpuset_subsys_id), - struct cpuset, css); + return css_cs(cgroup_css(cgrp, cpuset_subsys_id)); } /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { - return container_of(task_css(task, cpuset_subsys_id), - struct cpuset, css); + return css_cs(task_css(task, cpuset_subsys_id)); } static inline struct cpuset *parent_cs(struct cpuset *cs) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 323d907eac1a..5bccb0277129 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7083,11 +7083,15 @@ int sched_rt_handler(struct ctl_table *table, int write, #ifdef CONFIG_CGROUP_SCHED +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct task_group, css) : NULL; +} + /* return corresponding task_group object of a cgroup */ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpu_cgroup_subsys_id), - struct task_group, css); + return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); } static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 4a210faaab77..8ccfa10cc89f 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -33,18 +33,21 @@ struct cpuacct { struct kernel_cpustat __percpu *cpustat; }; +static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cpuacct, css) : NULL; +} + /* return cpu accounting group corresponding to this container */ static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, cpuacct_subsys_id), - struct cpuacct, css); + return css_ca(cgroup_css(cgrp, cpuacct_subsys_id)); } /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { - return container_of(task_css(tsk, cpuacct_subsys_id), - struct cpuacct, css); + return css_ca(task_css(tsk, cpuacct_subsys_id)); } static inline struct cpuacct *__parent_ca(struct cpuacct *ca) diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d2f9fc0b186e..95585a0b9c8d 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -36,7 +36,7 @@ static struct hugetlb_cgroup *root_h_cgroup __read_mostly; static inline struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) { - return container_of(s, struct hugetlb_cgroup, css); + return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } static inline diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b47bd3ad3c2b..11d659e3b08e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -486,7 +486,7 @@ static DEFINE_MUTEX(memcg_create_mutex); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { - return container_of(s, struct mem_cgroup, css); + return s ? container_of(s, struct mem_cgroup, css) : NULL; } /* Some nice accessors for the vmpressure. */ diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 5ee72a001df0..af412ab2b477 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -23,16 +23,19 @@ #include #include +static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct cgroup_cls_state, css) : NULL; +} + static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) { - return container_of(cgroup_css(cgrp, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css_cls_state(cgroup_css(cgrp, net_cls_subsys_id)); } static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { - return container_of(task_css(p, net_cls_subsys_id), - struct cgroup_cls_state, css); + return css_cls_state(task_css(p, net_cls_subsys_id)); } static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 87a0a037fbd6..90953648c643 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -53,7 +53,7 @@ struct dev_cgroup { static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) { - return container_of(s, struct dev_cgroup, css); + return s ? container_of(s, struct dev_cgroup, css) : NULL; } static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) -- cgit v1.3-8-gc7d7 From 6387698699afd72d6304566fb6ccf84bffe07c56 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add css_parent() Currently, controllers have to explicitly follow the cgroup hierarchy to find the parent of a given css. cgroup is moving towards using cgroup_subsys_state as the main controller interface construct, so let's provide a way to climb the hierarchy using just csses. This patch implements css_parent() which, given a css, returns its parent. The function is guarnateed to valid non-NULL parent css as long as the target css is not at the top of the hierarchy. freezer, cpuset, cpu, cpuacct, hugetlb, memory, net_cls and devices are converted to use css_parent() instead of accessing cgroup->parent directly. * __parent_ca() is dropped from cpuacct and its usage is replaced with parent_ca(). The only difference between the two was NULL test on cgroup->parent which is now embedded in css_parent() making the distinction moot. Note that eventually a css->parent field will be added to css and the NULL check in css_parent() will go away. This patch shouldn't cause any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- block/blk-cgroup.h | 4 +--- include/linux/cgroup.h | 15 +++++++++++++++ kernel/cgroup_freezer.c | 8 ++------ kernel/cpuset.c | 6 +----- kernel/sched/core.c | 9 +++------ kernel/sched/cpuacct.c | 11 ++--------- mm/hugetlb_cgroup.c | 6 +----- mm/memcontrol.c | 39 +++++++++++---------------------------- net/sched/cls_cgroup.c | 8 +++++--- security/device_cgroup.c | 18 +++++------------- 10 files changed, 46 insertions(+), 78 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 8e5863e900bf..b6802c46d68f 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -209,9 +209,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio) */ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) { - struct cgroup *pcg = blkcg->css.cgroup->parent; - - return pcg ? cgroup_to_blkcg(pcg) : NULL; + return css_to_blkcg(css_parent(&blkcg->css)); } /** diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 821678aae4db..18112a3bb12b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -646,6 +646,21 @@ struct cgroup_subsys { #undef IS_SUBSYS_ENABLED #undef SUBSYS +/** + * css_parent - find the parent css + * @css: the target cgroup_subsys_state + * + * Return the parent css of @css. This function is guaranteed to return + * non-NULL parent as long as @css isn't the root. + */ +static inline +struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) +{ + struct cgroup *parent_cgrp = css->cgroup->parent; + + return parent_cgrp ? parent_cgrp->subsys[css->ss->subsys_id] : NULL; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 1db686e47a22..657a73cd44c4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -62,11 +62,7 @@ static inline struct freezer *task_freezer(struct task_struct *task) static struct freezer *parent_freezer(struct freezer *freezer) { - struct cgroup *pcg = freezer->css.cgroup->parent; - - if (pcg) - return cgroup_freezer(pcg); - return NULL; + return css_freezer(css_parent(&freezer->css)); } bool cgroup_freezing(struct task_struct *task) @@ -234,7 +230,7 @@ static void freezer_fork(struct task_struct *task) * The root cgroup is non-freezable, so we can skip the * following check. */ - if (!freezer->css.cgroup->parent) + if (!parent_freezer(freezer)) goto out; spin_lock_irq(&freezer->lock); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6e9cbdde25bd..259a4af37e69 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -133,11 +133,7 @@ static inline struct cpuset *task_cs(struct task_struct *task) static inline struct cpuset *parent_cs(struct cpuset *cs) { - struct cgroup *pcgrp = cs->css.cgroup->parent; - - if (pcgrp) - return cgroup_cs(pcgrp); - return NULL; + return css_cs(css_parent(&cs->css)); } #ifdef CONFIG_NUMA diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5bccb0277129..7a10742b389a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7114,13 +7114,10 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) static int cpu_cgroup_css_online(struct cgroup *cgrp) { struct task_group *tg = cgroup_tg(cgrp); - struct task_group *parent; + struct task_group *parent = css_tg(css_parent(&tg->css)); - if (!cgrp->parent) - return 0; - - parent = cgroup_tg(cgrp->parent); - sched_online_group(tg, parent); + if (parent) + sched_online_group(tg, parent); return 0; } diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 8ccfa10cc89f..f6926a149a71 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -50,16 +50,9 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) return css_ca(task_css(tsk, cpuacct_subsys_id)); } -static inline struct cpuacct *__parent_ca(struct cpuacct *ca) -{ - return cgroup_ca(ca->css.cgroup->parent); -} - static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - if (!ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); + return css_ca(css_parent(&ca->css)); } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); @@ -284,7 +277,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) while (ca != &root_cpuacct) { kcpustat = this_cpu_ptr(ca->cpustat); kcpustat->cpustat[index] += val; - ca = __parent_ca(ca); + ca = parent_ca(ca); } rcu_read_unlock(); } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 95585a0b9c8d..57ecb5d2513f 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -59,11 +59,7 @@ static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) static inline struct hugetlb_cgroup * parent_hugetlb_cgroup(struct hugetlb_cgroup *h_cg) { - struct cgroup *parent = h_cg->css.cgroup->parent; - - if (!parent) - return NULL; - return hugetlb_cgroup_from_cgroup(parent); + return hugetlb_cgroup_from_css(css_parent(&h_cg->css)); } static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 11d659e3b08e..69b3e520f921 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1524,10 +1524,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) int mem_cgroup_swappiness(struct mem_cgroup *memcg) { - struct cgroup *cgrp = memcg->css.cgroup; - /* root ? */ - if (cgrp->parent == NULL) + if (!css_parent(&memcg->css)) return vm_swappiness; return memcg->swappiness; @@ -5026,11 +5024,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, { int retval = 0; struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct cgroup *parent = cont->parent; - struct mem_cgroup *parent_memcg = NULL; - - if (parent) - parent_memcg = mem_cgroup_from_cont(parent); + struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); mutex_lock(&memcg_create_mutex); @@ -5282,18 +5276,15 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, unsigned long long *mem_limit, unsigned long long *memsw_limit) { - struct cgroup *cgroup; unsigned long long min_limit, min_memsw_limit, tmp; min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); - cgroup = memcg->css.cgroup; if (!memcg->use_hierarchy) goto out; - while (cgroup->parent) { - cgroup = cgroup->parent; - memcg = mem_cgroup_from_cont(cgroup); + while (css_parent(&memcg->css)) { + memcg = mem_cgroup_from_css(css_parent(&memcg->css)); if (!memcg->use_hierarchy) break; tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -5523,16 +5514,11 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; - - if (val > 100) - return -EINVAL; + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - if (cgrp->parent == NULL) + if (val > 100 || !parent) return -EINVAL; - parent = mem_cgroup_from_cont(cgrp->parent); - mutex_lock(&memcg_create_mutex); /* If under hierarchy, only empty-root can set this value */ @@ -5861,14 +5847,12 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - struct mem_cgroup *parent; + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!cgrp->parent || !((val == 0) || (val == 1))) + if (!parent || !((val == 0) || (val == 1))) return -EINVAL; - parent = mem_cgroup_from_cont(cgrp->parent); - mutex_lock(&memcg_create_mutex); /* oom-kill-disable is a flag for subhierarchy. */ if ((parent->use_hierarchy) || memcg_has_children(memcg)) { @@ -6266,15 +6250,14 @@ free_out: static int mem_cgroup_css_online(struct cgroup *cont) { - struct mem_cgroup *memcg, *parent; + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); int error = 0; - if (!cont->parent) + if (!parent) return 0; mutex_lock(&memcg_create_mutex); - memcg = mem_cgroup_from_cont(cont); - parent = mem_cgroup_from_cont(cont->parent); memcg->use_hierarchy = parent->use_hierarchy; memcg->oom_kill_disable = parent->oom_kill_disable; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index af412ab2b477..9e6b75e5efce 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -50,9 +50,11 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) static int cgrp_css_online(struct cgroup *cgrp) { - if (cgrp->parent) - cgrp_cls_state(cgrp)->classid = - cgrp_cls_state(cgrp->parent)->classid; + struct cgroup_cls_state *cs = cgrp_cls_state(cgrp); + struct cgroup_cls_state *parent = css_cls_state(css_parent(&cs->css)); + + if (parent) + cs->classid = parent->classid; return 0; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 90953648c643..635a49db005d 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -198,13 +198,11 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg) */ static int devcgroup_online(struct cgroup *cgroup) { - struct dev_cgroup *dev_cgroup, *parent_dev_cgroup = NULL; + struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); + struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(&dev_cgroup->css)); int ret = 0; mutex_lock(&devcgroup_mutex); - dev_cgroup = cgroup_to_devcgroup(cgroup); - if (cgroup->parent) - parent_dev_cgroup = cgroup_to_devcgroup(cgroup->parent); if (parent_dev_cgroup == NULL) dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; @@ -394,12 +392,10 @@ static bool may_access(struct dev_cgroup *dev_cgroup, static int parent_has_perm(struct dev_cgroup *childcg, struct dev_exception_item *ex) { - struct cgroup *pcg = childcg->css.cgroup->parent; - struct dev_cgroup *parent; + struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css)); - if (!pcg) + if (!parent) return 1; - parent = cgroup_to_devcgroup(pcg); return may_access(parent, ex, childcg->behavior); } @@ -524,15 +520,11 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc = 0; struct dev_exception_item ex; - struct cgroup *p = devcgroup->css.cgroup; - struct dev_cgroup *parent = NULL; + struct dev_cgroup *parent = css_to_devcgroup(css_parent(&devcgroup->css)); if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (p->parent) - parent = cgroup_to_devcgroup(p->parent); - memset(&ex, 0, sizeof(ex)); b = buffer; -- cgit v1.3-8-gc7d7 From eb95419b023abacb415e2a18fea899023ce7624d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in subsystem methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup * in subsystem implementations for the following reasons. * With unified hierarchy, subsystems will be dynamically bound and unbound from cgroups and thus css's (cgroup_subsys_state) may be created and destroyed dynamically over the lifetime of a cgroup, which is different from the current state where all css's are allocated and destroyed together with the associated cgroup. This in turn means that cgroup_css() should be synchronized and may return NULL, making it more cumbersome to use. * Differing levels of per-subsystem granularity in the unified hierarchy means that the task and descendant iterators should behave differently depending on the specific subsystem the iteration is being performed for. * In majority of the cases, subsystems only care about its part in the cgroup hierarchy - ie. the hierarchy of css's. Subsystem methods often obtain the matching css pointer from the cgroup and don't bother with the cgroup pointer itself. Passing around css fits much better. This patch converts all cgroup_subsys methods to take @css instead of @cgroup. The conversions are mostly straight-forward. A few noteworthy changes are * ->css_alloc() now takes css of the parent cgroup rather than the pointer to the new cgroup as the css for the new cgroup doesn't exist yet. Knowing the parent css is enough for all the existing subsystems. * In kernel/cgroup.c::offline_css(), unnecessary open coded css dereference is replaced with local variable access. This patch shouldn't cause any behavior differences. v2: Unnecessary explicit cgrp->subsys[] deref in css_online() replaced with local variable @css as suggested by Li Zefan. Rebased on top of new for-3.12 which includes for-3.11-fixes so that ->css_free() invocation added by da0a12caff ("cgroup: fix a leak when percpu_ref_init() fails") is converted too. Suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- block/blk-cgroup.c | 25 +++++++++++---------- include/linux/cgroup.h | 24 +++++++++++--------- kernel/cgroup.c | 57 ++++++++++++++++++++++++++++------------------- kernel/cgroup_freezer.c | 40 +++++++++++++++++---------------- kernel/cpuset.c | 39 +++++++++++++++++--------------- kernel/events/core.c | 18 ++++++++------- kernel/sched/core.c | 39 ++++++++++++++++---------------- kernel/sched/cpuacct.c | 9 ++++---- mm/hugetlb_cgroup.c | 19 ++++++++-------- mm/memcontrol.c | 38 +++++++++++++++---------------- net/core/netprio_cgroup.c | 20 ++++++++--------- net/sched/cls_cgroup.c | 18 ++++++++------- security/device_cgroup.c | 22 +++++++++--------- 13 files changed, 197 insertions(+), 171 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 290792a13e3c..79fd9f4fadb7 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -765,18 +765,18 @@ struct cftype blkcg_files[] = { /** * blkcg_css_offline - cgroup css_offline callback - * @cgroup: cgroup of interest + * @css: css of interest * - * This function is called when @cgroup is about to go away and responsible - * for shooting down all blkgs associated with @cgroup. blkgs should be + * This function is called when @css is about to go away and responsible + * for shooting down all blkgs associated with @css. blkgs should be * removed while holding both q and blkcg locks. As blkcg lock is nested * inside q lock, this function performs reverse double lock dancing. * * This is the blkcg counterpart of ioc_release_fn(). */ -static void blkcg_css_offline(struct cgroup *cgroup) +static void blkcg_css_offline(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); spin_lock_irq(&blkcg->lock); @@ -798,21 +798,21 @@ static void blkcg_css_offline(struct cgroup *cgroup) spin_unlock_irq(&blkcg->lock); } -static void blkcg_css_free(struct cgroup *cgroup) +static void blkcg_css_free(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); if (blkcg != &blkcg_root) kfree(blkcg); } -static struct cgroup_subsys_state *blkcg_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { static atomic64_t id_seq = ATOMIC64_INIT(0); struct blkcg *blkcg; - struct cgroup *parent = cgroup->parent; - if (!parent) { + if (!parent_css) { blkcg = &blkcg_root; goto done; } @@ -883,14 +883,15 @@ void blkcg_exit_queue(struct request_queue *q) * of the main cic data structures. For now we allow a task to change * its cgroup only if it's the only owner of its ioc. */ -static int blkcg_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int blkcg_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; struct io_context *ioc; int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 18112a3bb12b..9c2b9dd9121d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -579,18 +579,22 @@ int cgroup_taskset_size(struct cgroup_taskset *tset); */ struct cgroup_subsys { - struct cgroup_subsys_state *(*css_alloc)(struct cgroup *cgrp); - int (*css_online)(struct cgroup *cgrp); - void (*css_offline)(struct cgroup *cgrp); - void (*css_free)(struct cgroup *cgrp); - - int (*can_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); - void (*cancel_attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); - void (*attach)(struct cgroup *cgrp, struct cgroup_taskset *tset); + struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css); + int (*css_online)(struct cgroup_subsys_state *css); + void (*css_offline)(struct cgroup_subsys_state *css); + void (*css_free)(struct cgroup_subsys_state *css); + + int (*can_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*cancel_attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); + void (*attach)(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset); void (*fork)(struct task_struct *task); - void (*exit)(struct cgroup *cgrp, struct cgroup *old_cgrp, + void (*exit)(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, struct task_struct *task); - void (*bind)(struct cgroup *root); + void (*bind)(struct cgroup_subsys_state *root_css); int subsys_id; int disabled; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4234428f1014..271d9a5cde5f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -853,8 +853,11 @@ static void cgroup_free_fn(struct work_struct *work) /* * Release the subsystem state objects. */ - for_each_root_subsys(cgrp->root, ss) - ss->css_free(cgrp); + for_each_root_subsys(cgrp->root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + + ss->css_free(css); + } cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -1056,7 +1059,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp); + ss->bind(cgrp->subsys[i]); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; @@ -1066,7 +1069,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(cgrp->subsys[i]->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top); + ss->bind(cgroup_dummy_top->subsys[i]); cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; @@ -2049,8 +2052,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss->can_attach) { - retval = ss->can_attach(cgrp, &tset); + retval = ss->can_attach(css, &tset); if (retval) { failed_ss = ss; goto out_cancel_attach; @@ -2089,8 +2094,10 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss->attach) - ss->attach(cgrp, &tset); + ss->attach(css, &tset); } /* @@ -2109,10 +2116,12 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + if (ss == failed_ss) break; if (ss->cancel_attach) - ss->cancel_attach(cgrp, &tset); + ss->cancel_attach(css, &tset); } } out_free_group_list: @@ -4206,14 +4215,15 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { + struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; int ret = 0; lockdep_assert_held(&cgroup_mutex); if (ss->css_online) - ret = ss->css_online(cgrp); + ret = ss->css_online(css); if (!ret) - cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; + css->flags |= CSS_ONLINE; return ret; } @@ -4228,9 +4238,9 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) return; if (ss->css_offline) - ss->css_offline(cgrp); + ss->css_offline(css); - cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; + css->flags &= ~CSS_ONLINE; } /* @@ -4305,7 +4315,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgrp); + css = ss->css_alloc(parent->subsys[ss->subsys_id]); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4313,7 +4323,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = percpu_ref_init(&css->refcnt, css_release); if (err) { - ss->css_free(cgrp); + ss->css_free(css); goto err_free_all; } @@ -4386,7 +4396,7 @@ err_free_all: if (css) { percpu_ref_cancel_init(&css->refcnt); - ss->css_free(cgrp); + ss->css_free(css); } } mutex_unlock(&cgroup_mutex); @@ -4641,7 +4651,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, cgroup_dummy_top); @@ -4720,7 +4730,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top); + css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4836,7 +4846,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_dummy_top); + ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]); cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -5192,10 +5202,10 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) */ for_each_builtin_subsys(ss, i) { if (ss->exit) { - struct cgroup *old_cgrp = cset->subsys[i]->cgroup; - struct cgroup *cgrp = task_cgroup(tsk, i); + struct cgroup_subsys_state *old_css = cset->subsys[i]; + struct cgroup_subsys_state *css = task_css(tsk, i); - ss->exit(cgrp, old_cgrp, tsk); + ss->exit(css, old_css, tsk); } } } @@ -5529,7 +5539,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) } #ifdef CONFIG_CGROUP_DEBUG -static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +debug_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); @@ -5539,9 +5550,9 @@ static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cgrp) return css; } -static void debug_css_free(struct cgroup *cgrp) +static void debug_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp->subsys[debug_subsys_id]); + kfree(css); } static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 657a73cd44c4..f03a85719c3c 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -91,7 +91,8 @@ static const char *freezer_state_strs(unsigned int state) struct cgroup_subsys freezer_subsys; -static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +freezer_css_alloc(struct cgroup_subsys_state *parent_css) { struct freezer *freezer; @@ -104,16 +105,16 @@ static struct cgroup_subsys_state *freezer_css_alloc(struct cgroup *cgroup) } /** - * freezer_css_online - commit creation of a freezer cgroup - * @cgroup: cgroup being created + * freezer_css_online - commit creation of a freezer css + * @css: css being created * - * We're committing to creation of @cgroup. Mark it online and inherit + * We're committing to creation of @css. Mark it online and inherit * parent's freezing state while holding both parent's and our * freezer->lock. */ -static int freezer_css_online(struct cgroup *cgroup) +static int freezer_css_online(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); struct freezer *parent = parent_freezer(freezer); /* @@ -140,15 +141,15 @@ static int freezer_css_online(struct cgroup *cgroup) } /** - * freezer_css_offline - initiate destruction of @cgroup - * @cgroup: cgroup being destroyed + * freezer_css_offline - initiate destruction of a freezer css + * @css: css being destroyed * - * @cgroup is going away. Mark it dead and decrement system_freezing_count - * if it was holding one. + * @css is going away. Mark it dead and decrement system_freezing_count if + * it was holding one. */ -static void freezer_css_offline(struct cgroup *cgroup) +static void freezer_css_offline(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); spin_lock_irq(&freezer->lock); @@ -160,9 +161,9 @@ static void freezer_css_offline(struct cgroup *cgroup) spin_unlock_irq(&freezer->lock); } -static void freezer_css_free(struct cgroup *cgroup) +static void freezer_css_free(struct cgroup_subsys_state *css) { - kfree(cgroup_freezer(cgroup)); + kfree(css_freezer(css)); } /* @@ -174,25 +175,26 @@ static void freezer_css_free(struct cgroup *cgroup) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup *new_cgrp, struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_subsys_state *new_css, + struct cgroup_taskset *tset) { - struct freezer *freezer = cgroup_freezer(new_cgrp); + struct freezer *freezer = css_freezer(new_css); struct task_struct *task; bool clear_frozen = false; spin_lock_irq(&freezer->lock); /* - * Make the new tasks conform to the current state of @new_cgrp. + * Make the new tasks conform to the current state of @new_css. * For simplicity, when migrating any task to a FROZEN cgroup, we * revert it to FREEZING and let update_if_frozen() determine the * correct state later. * - * Tasks in @tset are on @new_cgrp but may not conform to its + * Tasks in @tset are on @new_css but may not conform to its * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_cgrp, tset) { + cgroup_taskset_for_each(task, new_css->cgroup, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 259a4af37e69..8ce3fdc3dfcc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1455,9 +1455,10 @@ static int fmeter_getrate(struct fmeter *fmp) } /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct task_struct *task; int ret; @@ -1468,11 +1469,11 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) * flag is set. */ ret = -ENOSPC; - if (!cgroup_sane_behavior(cgrp) && + if (!cgroup_sane_behavior(css->cgroup) && (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1501,11 +1502,11 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup *cgrp, +static void cpuset_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mutex_lock(&cpuset_mutex); - cgroup_cs(cgrp)->attach_in_progress--; + css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); } @@ -1516,7 +1517,8 @@ static void cpuset_cancel_attach(struct cgroup *cgrp, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; @@ -1524,7 +1526,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *oldcs = cgroup_cs(oldcgrp); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1539,7 +1541,7 @@ static void cpuset_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1940,11 +1942,12 @@ static struct cftype files[] = { * cgrp: control group that the new cpuset will be part of */ -static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuset_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuset *cs; - if (!cgrp->parent) + if (!parent_css) return &top_cpuset.css; cs = kzalloc(sizeof(*cs), GFP_KERNEL); @@ -1964,9 +1967,9 @@ static struct cgroup_subsys_state *cpuset_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cpuset_css_online(struct cgroup *cgrp) +static int cpuset_css_online(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; struct cgroup *pos_cgrp; @@ -1984,7 +1987,7 @@ static int cpuset_css_online(struct cgroup *cgrp) number_of_cpusets++; - if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags)) + if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; /* @@ -2024,9 +2027,9 @@ out_unlock: * will call rebuild_sched_domains_locked(). */ -static void cpuset_css_offline(struct cgroup *cgrp) +static void cpuset_css_offline(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -2039,9 +2042,9 @@ static void cpuset_css_offline(struct cgroup *cgrp) mutex_unlock(&cpuset_mutex); } -static void cpuset_css_free(struct cgroup *cgrp) +static void cpuset_css_free(struct cgroup_subsys_state *css) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); free_cpumask_var(cs->cpus_allowed); kfree(cs); diff --git a/kernel/events/core.c b/kernel/events/core.c index 414c61f4d776..9705a0ed1dce 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7778,7 +7778,8 @@ unlock: device_initcall(perf_event_sysfs_init); #ifdef CONFIG_CGROUP_PERF -static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) +static struct cgroup_subsys_state * +perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct perf_cgroup *jc; @@ -7795,11 +7796,10 @@ static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont) return &jc->css; } -static void perf_cgroup_css_free(struct cgroup *cont) +static void perf_cgroup_css_free(struct cgroup_subsys_state *css) { - struct perf_cgroup *jc; - jc = container_of(cgroup_css(cont, perf_subsys_id), - struct perf_cgroup, css); + struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css); + free_percpu(jc->info); kfree(jc); } @@ -7811,15 +7811,17 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css->cgroup, tset) task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, +static void perf_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, struct task_struct *task) { /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7a10742b389a..622b7efc5ade 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7094,16 +7094,17 @@ static inline struct task_group *cgroup_tg(struct cgroup *cgrp) return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); } -static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { - struct task_group *tg, *parent; + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; - if (!cgrp->parent) { + if (!parent) { /* This is early initialization for the top cgroup */ return &root_task_group.css; } - parent = cgroup_tg(cgrp->parent); tg = sched_create_group(parent); if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); @@ -7111,38 +7112,38 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp) return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup *cgrp) +static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); - struct task_group *parent = css_tg(css_parent(&tg->css)); + struct task_group *tg = css_tg(css); + struct task_group *parent = css_tg(css_parent(css)); if (parent) sched_online_group(tg, parent); return 0; } -static void cpu_cgroup_css_free(struct cgroup *cgrp) +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_destroy_group(tg); } -static void cpu_cgroup_css_offline(struct cgroup *cgrp) +static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); sched_offline_group(tg); } -static int cpu_cgroup_can_attach(struct cgroup *cgrp, +static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) { + cgroup_taskset_for_each(task, css->cgroup, tset) { #ifdef CONFIG_RT_GROUP_SCHED - if (!sched_rt_can_attach(cgroup_tg(cgrp), task)) + if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; #else /* We don't support RT-tasks being in separate groups */ @@ -7153,18 +7154,18 @@ static int cpu_cgroup_can_attach(struct cgroup *cgrp, return 0; } -static void cpu_cgroup_attach(struct cgroup *cgrp, +static void cpu_cgroup_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *task; - cgroup_taskset_for_each(task, cgrp, tset) + cgroup_taskset_for_each(task, css->cgroup, tset) sched_move_task(task); } -static void -cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp, - struct task_struct *task) +static void cpu_cgroup_exit(struct cgroup_subsys_state *css, + struct cgroup_subsys_state *old_css, + struct task_struct *task) { /* * cgroup_exit() is called in the copy_process() failure path. diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index f6926a149a71..1b784d9b3630 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -62,11 +62,12 @@ static struct cpuacct root_cpuacct = { }; /* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) { struct cpuacct *ca; - if (!cgrp->parent) + if (!parent_css) return &root_cpuacct.css; ca = kzalloc(sizeof(*ca), GFP_KERNEL); @@ -92,9 +93,9 @@ out: } /* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) +static void cpuacct_css_free(struct cgroup_subsys_state *css) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); free_percpu(ca->cpustat); free_percpu(ca->cpuusage); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 57ecb5d2513f..e2132435060f 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -73,19 +73,18 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg) return false; } -static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { + struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); + struct hugetlb_cgroup *h_cgroup; int idx; - struct cgroup *parent_cgroup; - struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); - parent_cgroup = cgroup->parent; - if (parent_cgroup) { - parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); + if (parent_h_cgroup) { for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) res_counter_init(&h_cgroup->hugepage[idx], &parent_h_cgroup->hugepage[idx]); @@ -97,11 +96,11 @@ static struct cgroup_subsys_state *hugetlb_cgroup_css_alloc(struct cgroup *cgrou return &h_cgroup->css; } -static void hugetlb_cgroup_css_free(struct cgroup *cgroup) +static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { struct hugetlb_cgroup *h_cgroup; - h_cgroup = hugetlb_cgroup_from_cgroup(cgroup); + h_cgroup = hugetlb_cgroup_from_css(css); kfree(h_cgroup); } @@ -150,9 +149,9 @@ out: * Force the hugetlb cgroup to empty the hugetlb resources by moving them to * the parent cgroup. */ -static void hugetlb_cgroup_css_offline(struct cgroup *cgroup) +static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); struct hstate *h; struct page *page; int idx = 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 69b3e520f921..32cca0f0af0d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6211,7 +6211,7 @@ static void __init mem_cgroup_soft_limit_tree_init(void) } static struct cgroup_subsys_state * __ref -mem_cgroup_css_alloc(struct cgroup *cont) +mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct mem_cgroup *memcg; long error = -ENOMEM; @@ -6226,7 +6226,7 @@ mem_cgroup_css_alloc(struct cgroup *cont) goto free_out; /* root ? */ - if (cont->parent == NULL) { + if (parent_css == NULL) { root_mem_cgroup = memcg; res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); @@ -6248,10 +6248,10 @@ free_out: } static int -mem_cgroup_css_online(struct cgroup *cont) +mem_cgroup_css_online(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; if (!parent) @@ -6308,9 +6308,9 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) mem_cgroup_iter_invalidate(root_mem_cgroup); } -static void mem_cgroup_css_offline(struct cgroup *cont) +static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); kmem_cgroup_css_offline(memcg); @@ -6319,9 +6319,9 @@ static void mem_cgroup_css_offline(struct cgroup *cont) mem_cgroup_destroy_all_caches(memcg); } -static void mem_cgroup_css_free(struct cgroup *cont) +static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg_destroy_kmem(memcg); __mem_cgroup_free(memcg); @@ -6691,12 +6691,12 @@ static void mem_cgroup_clear_mc(void) mem_cgroup_end_move(from); } -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); int ret = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long move_charge_at_immigrate; /* @@ -6738,7 +6738,7 @@ static int mem_cgroup_can_attach(struct cgroup *cgroup, return ret; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { mem_cgroup_clear_mc(); @@ -6886,7 +6886,7 @@ retry: up_read(&mm->mmap_sem); } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { struct task_struct *p = cgroup_taskset_first(tset); @@ -6901,16 +6901,16 @@ static void mem_cgroup_move_task(struct cgroup *cont, mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -static int mem_cgroup_can_attach(struct cgroup *cgroup, +static int mem_cgroup_can_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { return 0; } -static void mem_cgroup_cancel_attach(struct cgroup *cgroup, +static void mem_cgroup_cancel_attach(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } -static void mem_cgroup_move_task(struct cgroup *cont, +static void mem_cgroup_move_task(struct cgroup_subsys_state *css, struct cgroup_taskset *tset) { } @@ -6920,15 +6920,15 @@ static void mem_cgroup_move_task(struct cgroup *cont, * Cgroup retains root cgroups across [un]mount cycles making it necessary * to verify sane_behavior flag on each mount attempt. */ -static void mem_cgroup_bind(struct cgroup *root) +static void mem_cgroup_bind(struct cgroup_subsys_state *root_css) { /* * use_hierarchy is forced with sane_behavior. cgroup core * guarantees that @root doesn't have any children, so turning it * on for the root memcg is enough. */ - if (cgroup_sane_behavior(root)) - mem_cgroup_from_cont(root)->use_hierarchy = true; + if (cgroup_sane_behavior(root_css->cgroup)) + mem_cgroup_from_css(root_css)->use_hierarchy = true; } struct cgroup_subsys mem_cgroup_subsys = { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 5dfac8886e12..8d095b4c2f6f 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -126,7 +126,8 @@ static int netprio_set_prio(struct cgroup_subsys_state *css, return 0; } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_subsys_state *css; @@ -137,16 +138,14 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) return css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, net_prio_subsys_id); - struct cgroup_subsys_state *parent_css; + struct cgroup_subsys_state *parent_css = css_parent(css); struct net_device *dev; int ret = 0; - if (!cgrp->parent) + if (!parent_css) return 0; - parent_css = cgroup_css(cgrp->parent, net_prio_subsys_id); rtnl_lock(); /* @@ -164,9 +163,9 @@ static int cgrp_css_online(struct cgroup *cgrp) return ret; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgroup_css(cgrp, net_prio_subsys_id)); + kfree(css); } static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) @@ -221,12 +220,13 @@ static int update_netprio(const void *v, struct file *file, unsigned n) return 0; } -static void net_prio_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void net_prio_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; void *v; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css->cgroup, tset) { task_lock(p); v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 9e6b75e5efce..dc3983835893 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -38,7 +38,8 @@ static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) return css_cls_state(task_css(p, net_cls_subsys_id)); } -static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) +static struct cgroup_subsys_state * +cgrp_css_alloc(struct cgroup_subsys_state *parent_css) { struct cgroup_cls_state *cs; @@ -48,19 +49,19 @@ static struct cgroup_subsys_state *cgrp_css_alloc(struct cgroup *cgrp) return &cs->css; } -static int cgrp_css_online(struct cgroup *cgrp) +static int cgrp_css_online(struct cgroup_subsys_state *css) { - struct cgroup_cls_state *cs = cgrp_cls_state(cgrp); - struct cgroup_cls_state *parent = css_cls_state(css_parent(&cs->css)); + struct cgroup_cls_state *cs = css_cls_state(css); + struct cgroup_cls_state *parent = css_cls_state(css_parent(css)); if (parent) cs->classid = parent->classid; return 0; } -static void cgrp_css_free(struct cgroup *cgrp) +static void cgrp_css_free(struct cgroup_subsys_state *css) { - kfree(cgrp_cls_state(cgrp)); + kfree(css_cls_state(css)); } static int update_classid(const void *v, struct file *file, unsigned n) @@ -72,12 +73,13 @@ static int update_classid(const void *v, struct file *file, unsigned n) return 0; } -static void cgrp_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +static void cgrp_attach(struct cgroup_subsys_state *css, + struct cgroup_taskset *tset) { struct task_struct *p; void *v; - cgroup_taskset_for_each(p, cgrp, tset) { + cgroup_taskset_for_each(p, css->cgroup, tset) { task_lock(p); v = (void *)(unsigned long)task_cls_classid(p); iterate_fd(p->files, 0, update_classid, v); diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 635a49db005d..7293ac49ba7b 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -68,7 +68,7 @@ static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) struct cgroup_subsys devices_subsys; -static int devcgroup_can_attach(struct cgroup *new_cgrp, +static int devcgroup_can_attach(struct cgroup_subsys_state *new_css, struct cgroup_taskset *set) { struct task_struct *task = cgroup_taskset_first(set); @@ -193,13 +193,13 @@ static inline bool is_devcg_online(const struct dev_cgroup *devcg) /** * devcgroup_online - initializes devcgroup's behavior and exceptions based on * parent's - * @cgroup: cgroup getting online + * @css: css getting online * returns 0 in case of success, error code otherwise */ -static int devcgroup_online(struct cgroup *cgroup) +static int devcgroup_online(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); - struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(&dev_cgroup->css)); + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); + struct dev_cgroup *parent_dev_cgroup = css_to_devcgroup(css_parent(css)); int ret = 0; mutex_lock(&devcgroup_mutex); @@ -217,9 +217,9 @@ static int devcgroup_online(struct cgroup *cgroup) return ret; } -static void devcgroup_offline(struct cgroup *cgroup) +static void devcgroup_offline(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup = cgroup_to_devcgroup(cgroup); + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); mutex_lock(&devcgroup_mutex); dev_cgroup->behavior = DEVCG_DEFAULT_NONE; @@ -229,7 +229,8 @@ static void devcgroup_offline(struct cgroup *cgroup) /* * called from kernel/cgroup.c with cgroup_lock() held. */ -static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) +static struct cgroup_subsys_state * +devcgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct dev_cgroup *dev_cgroup; @@ -242,11 +243,10 @@ static struct cgroup_subsys_state *devcgroup_css_alloc(struct cgroup *cgroup) return &dev_cgroup->css; } -static void devcgroup_css_free(struct cgroup *cgroup) +static void devcgroup_css_free(struct cgroup_subsys_state *css) { - struct dev_cgroup *dev_cgroup; + struct dev_cgroup *dev_cgroup = css_to_devcgroup(css); - dev_cgroup = cgroup_to_devcgroup(cgroup); __dev_exception_clean(dev_cgroup); kfree(dev_cgroup); } -- cgit v1.3-8-gc7d7 From 2bb566cb68dfafad328af666ebadf0e49accd6ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: add subsys backlink pointer to cftype cgroup is transitioning to using css (cgroup_subsys_state) instead of cgroup as the primary subsystem handle. The cgroupfs file interface will be converted to use css's which requires finding out the subsystem from cftype so that the matching css can be determined from the cgroup. This patch adds cftype->ss which points to the subsystem the file belongs to. The field is initialized while a cftype is being registered. This makes it unnecessary to explicitly specify the subsystem for other cftype handling functions. @ss argument dropped from various cftype handling functions. This patch shouldn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Cc: Jens Axboe --- block/blk-cgroup.c | 2 +- include/linux/cgroup.h | 8 +++++- kernel/cgroup.c | 78 ++++++++++++++++++++++++++++---------------------- 3 files changed, 51 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 79fd9f4fadb7..34063739745b 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1128,7 +1128,7 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* kill the intf files first */ if (pol->cftypes) - cgroup_rm_cftypes(&blkio_subsys, pol->cftypes); + cgroup_rm_cftypes(pol->cftypes); /* unregister and update blkgs */ blkcg_policy[pol->plid] = NULL; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 9c2b9dd9121d..5db8138a0482 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -429,6 +429,12 @@ struct cftype { /* CFTYPE_* flags */ unsigned int flags; + /* + * The subsys this file belongs to. Initialized automatically + * during registration. NULL for cgroup core files. + */ + struct cgroup_subsys *ss; + int (*open)(struct inode *inode, struct file *file); ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, struct file *file, @@ -542,7 +548,7 @@ static inline const char *cgroup_name(const struct cgroup *cgrp) } int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); +int cgroup_rm_cftypes(struct cftype *cfts); bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 271d9a5cde5f..c4bc8dac3b1d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -219,8 +219,8 @@ static struct cftype cgroup_base_files[]; static void cgroup_offline_fn(struct work_struct *work); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add); +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add); /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) @@ -974,7 +974,7 @@ static void cgroup_clear_dir(struct cgroup *cgrp, unsigned long subsys_mask) if (!test_bit(i, &subsys_mask)) continue; list_for_each_entry(set, &ss->cftsets, node) - cgroup_addrm_files(cgrp, NULL, set->cfts, false); + cgroup_addrm_files(cgrp, set->cfts, false); } } @@ -1623,7 +1623,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, */ cred = override_creds(&init_cred); - ret = cgroup_addrm_files(root_cgrp, NULL, cgroup_base_files, true); + ret = cgroup_addrm_files(root_cgrp, cgroup_base_files, true); if (ret) goto rm_base_files; @@ -1681,7 +1681,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, rm_base_files: free_cgrp_cset_links(&tmp_links); - cgroup_addrm_files(&root->top_cgroup, NULL, cgroup_base_files, false); + cgroup_addrm_files(&root->top_cgroup, cgroup_base_files, false); revert_creds(cred); unlock_drop: cgroup_exit_root_id(root); @@ -2694,8 +2694,7 @@ static umode_t cgroup_file_mode(const struct cftype *cft) return mode; } -static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype *cft) +static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) { struct dentry *dir = cgrp->dentry; struct cgroup *parent = __d_cgrp(dir); @@ -2705,8 +2704,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (subsys && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { - strcpy(name, subsys->name); + if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + strcpy(name, cft->ss->name); strcat(name, "."); } strcat(name, cft->name); @@ -2743,17 +2742,16 @@ out: /** * cgroup_addrm_files - add or remove files to a cgroup directory * @cgrp: the target cgroup - * @subsys: the subsystem of files to be added * @cfts: array of cftypes to be added * @is_add: whether to add or remove * * Depending on @is_add, add or remove files defined by @cfts on @cgrp. - * All @cfts should belong to @subsys. For removals, this function never - * fails. If addition fails, this function doesn't remove files already - * added. The caller is responsible for cleaning up. + * For removals, this function never fails. If addition fails, this + * function doesn't remove files already added. The caller is responsible + * for cleaning up. */ -static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, - struct cftype cfts[], bool is_add) +static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], + bool is_add) { struct cftype *cft; int ret; @@ -2771,7 +2769,7 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, continue; if (is_add) { - ret = cgroup_add_file(cgrp, subsys, cft); + ret = cgroup_add_file(cgrp, cft); if (ret) { pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", cft->name, ret); @@ -2796,11 +2794,11 @@ static void cgroup_cfts_prepare(void) mutex_lock(&cgroup_mutex); } -static int cgroup_cfts_commit(struct cgroup_subsys *ss, - struct cftype *cfts, bool is_add) +static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) __releases(&cgroup_mutex) { LIST_HEAD(pending); + struct cgroup_subsys *ss = cfts[0].ss; struct cgroup *cgrp, *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; @@ -2828,7 +2826,7 @@ static int cgroup_cfts_commit(struct cgroup_subsys *ss, inode = root->dentry->d_inode; mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); - ret = cgroup_addrm_files(root, ss, cfts, is_add); + ret = cgroup_addrm_files(root, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2851,7 +2849,7 @@ static int cgroup_cfts_commit(struct cgroup_subsys *ss, mutex_lock(&inode->i_mutex); mutex_lock(&cgroup_mutex); if (cgrp->serial_nr < update_before && !cgroup_is_dead(cgrp)) - ret = cgroup_addrm_files(cgrp, ss, cfts, is_add); + ret = cgroup_addrm_files(cgrp, cfts, is_add); mutex_unlock(&cgroup_mutex); mutex_unlock(&inode->i_mutex); @@ -2883,51 +2881,56 @@ out_deact: int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) { struct cftype_set *set; + struct cftype *cft; int ret; set = kzalloc(sizeof(*set), GFP_KERNEL); if (!set) return -ENOMEM; + for (cft = cfts; cft->name[0] != '\0'; cft++) + cft->ss = ss; + cgroup_cfts_prepare(); set->cfts = cfts; list_add_tail(&set->node, &ss->cftsets); - ret = cgroup_cfts_commit(ss, cfts, true); + ret = cgroup_cfts_commit(cfts, true); if (ret) - cgroup_rm_cftypes(ss, cfts); + cgroup_rm_cftypes(cfts); return ret; } EXPORT_SYMBOL_GPL(cgroup_add_cftypes); /** * cgroup_rm_cftypes - remove an array of cftypes from a subsystem - * @ss: target cgroup subsystem * @cfts: zero-length name terminated array of cftypes * - * Unregister @cfts from @ss. Files described by @cfts are removed from - * all existing cgroups to which @ss is attached and all future cgroups - * won't have them either. This function can be called anytime whether @ss - * is attached or not. + * Unregister @cfts. Files described by @cfts are removed from all + * existing cgroups and all future cgroups won't have them either. This + * function can be called anytime whether @cfts' subsys is attached or not. * * Returns 0 on successful unregistration, -ENOENT if @cfts is not - * registered with @ss. + * registered. */ -int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) +int cgroup_rm_cftypes(struct cftype *cfts) { struct cftype_set *set; + if (!cfts || !cfts[0].ss) + return -ENOENT; + cgroup_cfts_prepare(); - list_for_each_entry(set, &ss->cftsets, node) { + list_for_each_entry(set, &cfts[0].ss->cftsets, node) { if (set->cfts == cfts) { list_del(&set->node); kfree(set); - cgroup_cfts_commit(ss, cfts, false); + cgroup_cfts_commit(cfts, false); return 0; } } - cgroup_cfts_commit(ss, NULL, false); + cgroup_cfts_commit(NULL, false); return -ENOENT; } @@ -4148,7 +4151,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) continue; list_for_each_entry(set, &ss->cftsets, node) { - ret = cgroup_addrm_files(cgrp, ss, set->cfts, true); + ret = cgroup_addrm_files(cgrp, set->cfts, true); if (ret < 0) goto err; } @@ -4377,7 +4380,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - err = cgroup_addrm_files(cgrp, NULL, cgroup_base_files, true); + err = cgroup_addrm_files(cgrp, cgroup_base_files, true); if (err) goto err_destroy; @@ -4538,7 +4541,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * but we aren't quite done with @cgrp yet, so hold onto it. */ cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); - cgroup_addrm_files(cgrp, NULL, cgroup_base_files, false); + cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); @@ -4632,6 +4635,11 @@ static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) * deregistration. */ if (ss->base_cftypes) { + struct cftype *cft; + + for (cft = ss->base_cftypes; cft->name[0] != '\0'; cft++) + cft->ss = ss; + ss->base_cftset.cfts = ss->base_cftypes; list_add_tail(&ss->base_cftset.node, &ss->cftsets); } -- cgit v1.3-8-gc7d7 From f7d58818ba4249f04a83b73aaac135640050bb4f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:23 -0400 Subject: cgroup: pin cgroup_subsys_state when opening a cgroupfs file Previously, each file read/write operation relied on the inode reference count pinning the cgroup and simply checked whether the cgroup was marked dead before proceeding to invoke the per-subsystem callback. This was rather silly as it didn't have any synchronization or css pinning around the check and the cgroup may be removed and all css refs drained between the DEAD check and actual method invocation. This patch pins the css between open() and release() so that it is guaranteed to be alive for all file operations and remove the silly DEAD checks from cgroup_file_read/write(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 43 ++++++++++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c4bc8dac3b1d..583f8f66a7e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2277,6 +2277,17 @@ static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, return 0; } +/* return the css for the given cgroup file */ +static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) +{ + struct cftype *cft = cfe->type; + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + + if (cft->ss) + return cgrp->subsys[cft->ss->subsys_id]; + return NULL; +} + /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2354,8 +2365,6 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_dead(cgrp)) - return -ENODEV; if (cft->write) return cft->write(cgrp, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) @@ -2399,9 +2408,6 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, struct cftype *cft = __d_cft(file->f_dentry); struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); - if (cgroup_is_dead(cgrp)) - return -ENODEV; - if (cft->read) return cft->read(cgrp, cft, file, buf, nbytes, ppos); if (cft->read_u64) @@ -2447,15 +2453,22 @@ static const struct file_operations cgroup_seqfile_operations = { static int cgroup_file_open(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); + struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); int err; - struct cfent *cfe; - struct cftype *cft; err = generic_file_open(inode, file); if (err) return err; - cfe = __d_cfe(file->f_dentry); - cft = cfe->type; + + /* + * If the file belongs to a subsystem, pin the css. Will be + * unpinned either on open failure or release. This ensures that + * @css stays alive for all file operations. + */ + if (css && !css_tryget(css)) + return -ENODEV; if (cft->read_map || cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; @@ -2464,15 +2477,23 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = cft->open(inode, file); } + if (css && err) + css_put(css); return err; } static int cgroup_file_release(struct inode *inode, struct file *file) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); + int ret = 0; + if (cft->release) - return cft->release(inode, file); - return 0; + ret = cft->release(inode, file); + if (css) + css_put(css); + return ret; } /* -- cgit v1.3-8-gc7d7 From 67f4c36f83455b253445b2cb28ac9a2c4f85d99a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: add cgroup->dummy_css cgroup subsystem API is being converted to use css (cgroup_subsys_state) as the main handle, which makes things a bit awkward for subsystem agnostic core features - the "cgroup.*" interface files and various iterations - a bit awkward as they don't have a css to use. This patch adds cgroup->dummy_css which has NULL ->ss and whose only role is pointing back to the cgroup. This will be used to support subsystem agnostic features on the coming css based API. css_parent() is updated to handle dummy_css's. Note that css will soon grow its own ->parent field and css_parent() will be made trivial. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 11 ++++++++++- kernel/cgroup.c | 9 +++++---- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5db8138a0482..b0d5f53ae5e1 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -225,6 +225,9 @@ struct cgroup { struct list_head pidlists; struct mutex pidlist_mutex; + /* dummy css with NULL ->ss, points back to this cgroup */ + struct cgroup_subsys_state dummy_css; + /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; @@ -668,7 +671,13 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) { struct cgroup *parent_cgrp = css->cgroup->parent; - return parent_cgrp ? parent_cgrp->subsys[css->ss->subsys_id] : NULL; + if (!parent_cgrp) + return NULL; + + if (css->ss) + return parent_cgrp->subsys[css->ss->subsys_id]; + else + return &parent_cgrp->dummy_css; } /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 583f8f66a7e1..c049992f1ffa 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1365,6 +1365,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + cgrp->dummy_css.cgroup = cgrp; INIT_LIST_HEAD(&cgrp->event_list); spin_lock_init(&cgrp->event_list_lock); simple_xattrs_init(&cgrp->xattrs); @@ -2285,7 +2286,7 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) if (cft->ss) return cgrp->subsys[cft->ss->subsys_id]; - return NULL; + return &cgrp->dummy_css; } /* A buffer size big enough for numbers or short strings */ @@ -2467,7 +2468,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * unpinned either on open failure or release. This ensures that * @css stays alive for all file operations. */ - if (css && !css_tryget(css)) + if (css->ss && !css_tryget(css)) return -ENODEV; if (cft->read_map || cft->read_seq_string) { @@ -2477,7 +2478,7 @@ static int cgroup_file_open(struct inode *inode, struct file *file) err = cft->open(inode, file); } - if (css && err) + if (css->ss && err) css_put(css); return err; } @@ -2491,7 +2492,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) if (cft->release) ret = cft->release(inode, file); - if (css) + if (css->ss) css_put(css); return ret; } -- cgit v1.3-8-gc7d7 From 182446d087906de40e514573a92a97b203695f71 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: pass around cgroup_subsys_state instead of cgroup in file methods cgroup is currently in the process of transitioning to using struct cgroup_subsys_state * as the primary handle instead of struct cgroup. Please see the previous commit which converts the subsystem methods for rationale. This patch converts all cftype file operations to take @css instead of @cgroup. cftypes for the cgroup core files don't have their subsytem pointer set. These will automatically use the dummy_css added by the previous patch and can be converted the same way. Most subsystem conversions are straight forwards but there are some interesting ones. * freezer: update_if_frozen() is also converted to take @css instead of @cgroup for consistency. This will make the code look simpler too once iterators are converted to use css. * memory/vmpressure: mem_cgroup_from_css() needs to be exported to vmpressure while mem_cgroup_from_cont() can be made static. Updated accordingly. * cpu: cgroup_tg() doesn't have any user left. Removed. * cpuacct: cgroup_ca() doesn't have any user left. Removed. * hugetlb: hugetlb_cgroup_form_cgroup() doesn't have any user left. Removed. * net_cls: cgrp_cls_state() doesn't have any user left. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Daniel Wagner Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe Cc: Steven Rostedt --- block/blk-cgroup.c | 6 +- block/blk-throttle.c | 32 ++++----- block/cfq-iosched.c | 90 ++++++++++++------------- include/linux/cgroup.h | 24 ++++--- include/linux/memcontrol.h | 2 +- kernel/cgroup.c | 162 +++++++++++++++++++++++---------------------- kernel/cgroup_freezer.c | 40 +++++------ kernel/cpuset.c | 35 +++++----- kernel/sched/core.c | 65 +++++++++--------- kernel/sched/cpuacct.c | 28 +++----- mm/hugetlb_cgroup.c | 26 +++----- mm/memcontrol.c | 88 ++++++++++++------------ mm/vmpressure.c | 4 +- net/core/netprio_cgroup.c | 10 ++- net/ipv4/tcp_memcontrol.c | 12 ++-- net/sched/cls_cgroup.c | 14 ++-- security/device_cgroup.c | 12 ++-- 17 files changed, 322 insertions(+), 328 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 34063739745b..f46f3c69179c 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -437,10 +437,10 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl, return &blkg->rl; } -static int blkcg_reset_stats(struct cgroup *cgroup, struct cftype *cftype, - u64 val) +static int blkcg_reset_stats(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) { - struct blkcg *blkcg = cgroup_to_blkcg(cgroup); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; int i; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 08a32dfd3844..88bcfb651b0b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1293,10 +1293,10 @@ static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &rwstat); } -static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_cpu_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl, cft->private, true); @@ -1325,26 +1325,26 @@ static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, return __blkg_prfill_u64(sf, pd, v); } -static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_u64, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int tg_print_conf_uint(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint, + blkcg_print_blkgs(sf, css_to_blkcg(css), tg_prfill_conf_uint, &blkcg_policy_throtl, cft->private, false); return 0; } -static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, - bool is_u64) +static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, + const char *buf, bool is_u64) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct throtl_grp *tg; struct throtl_service_queue *sq; @@ -1403,16 +1403,16 @@ static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf, return 0; } -static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_u64(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, true); + return tg_set_conf(css, cft, buf, true); } -static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft, +static int tg_set_conf_uint(struct cgroup_subsys_state *css, struct cftype *cft, const char *buf) { - return tg_set_conf(cgrp, cft, buf, false); + return tg_set_conf(css, cft, buf, false); } static struct cftype throtl_files[] = { diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index d5bbdcfd0dab..dabb9d02cf9a 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1607,12 +1607,11 @@ static u64 cfqg_prfill_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); } -static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } @@ -1626,35 +1625,34 @@ static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); } -static int cfqg_print_leaf_weight_device(struct cgroup *cgrp, +static int cfqg_print_leaf_weight_device(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), - cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, 0, - false); + blkcg_print_blkgs(sf, css_to_blkcg(css), cfqg_prfill_leaf_weight_device, + &blkcg_policy_cfq, 0, false); return 0; } -static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft, +static int cfq_print_weight(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_weight); return 0; } -static int cfq_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfq_print_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - seq_printf(sf, "%u\n", - cgroup_to_blkcg(cgrp)->cfq_leaf_weight); + seq_printf(sf, "%u\n", css_to_blkcg(css)->cfq_leaf_weight); return 0; } -static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf, bool is_leaf_weight) +static int __cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf, + bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkg_conf_ctx ctx; struct cfq_group *cfqg; int ret; @@ -1680,22 +1678,22 @@ static int __cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, return ret; } -static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, false); + return __cfqg_set_weight_device(css, cft, buf, false); } -static int cfqg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cfqg_set_leaf_weight_device(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - return __cfqg_set_weight_device(cgrp, cft, buf, true); + return __cfqg_set_weight_device(css, cft, buf, true); } -static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, - bool is_leaf_weight) +static int __cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val, bool is_leaf_weight) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); struct blkcg_gq *blkg; if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) @@ -1727,30 +1725,32 @@ static int __cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, return 0; } -static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - return __cfq_set_weight(cgrp, cft, val, false); + return __cfq_set_weight(css, cft, val, false); } -static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - return __cfq_set_weight(cgrp, cft, val, true); + return __cfq_set_weight(css, cft, val, true); } -static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft, +static int cfqg_print_stat(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_cfq, cft->private, true); @@ -1773,20 +1773,20 @@ static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, return __blkg_prfill_rwstat(sf, pd, &sum); } -static int cfqg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_stat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_stat_recursive, &blkcg_policy_cfq, cft->private, false); return 0; } -static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_rwstat_recursive(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, cft->private, true); @@ -1810,10 +1810,10 @@ static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, } /* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *sf) +static int cfqg_print_avg_queue_size(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *sf) { - struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg *blkcg = css_to_blkcg(css); blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, 0, false); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b0d5f53ae5e1..0b91436c68ef 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -439,34 +439,34 @@ struct cftype { struct cgroup_subsys *ss; int (*open)(struct inode *inode, struct file *file); - ssize_t (*read)(struct cgroup *cgrp, struct cftype *cft, + ssize_t (*read)(struct cgroup_subsys_state *css, struct cftype *cft, struct file *file, char __user *buf, size_t nbytes, loff_t *ppos); /* * read_u64() is a shortcut for the common case of returning a * single integer. Use it in place of read() */ - u64 (*read_u64)(struct cgroup *cgrp, struct cftype *cft); + u64 (*read_u64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_s64() is a signed version of read_u64() */ - s64 (*read_s64)(struct cgroup *cgrp, struct cftype *cft); + s64 (*read_s64)(struct cgroup_subsys_state *css, struct cftype *cft); /* * read_map() is used for defining a map of key/value * pairs. It should call cb->fill(cb, key, value) for each * entry. The key/value pairs (and their ordering) should not * change between reboots. */ - int (*read_map)(struct cgroup *cgrp, struct cftype *cft, + int (*read_map)(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb); /* * read_seq_string() is used for outputting a simple sequence * using seqfile. */ - int (*read_seq_string)(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *m); + int (*read_seq_string)(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m); - ssize_t (*write)(struct cgroup *cgrp, struct cftype *cft, + ssize_t (*write)(struct cgroup_subsys_state *css, struct cftype *cft, struct file *file, const char __user *buf, size_t nbytes, loff_t *ppos); @@ -475,18 +475,20 @@ struct cftype { * a single integer (as parsed by simple_strtoull) from * userspace. Use in place of write(); return 0 or error. */ - int (*write_u64)(struct cgroup *cgrp, struct cftype *cft, u64 val); + int (*write_u64)(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val); /* * write_s64() is a signed version of write_u64() */ - int (*write_s64)(struct cgroup *cgrp, struct cftype *cft, s64 val); + int (*write_s64)(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val); /* * write_string() is passed a nul-terminated kernelspace * buffer of maximum length determined by max_write_len. * Returns 0 or -ve error code. */ - int (*write_string)(struct cgroup *cgrp, struct cftype *cft, + int (*write_string)(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer); /* * trigger() callback can be used to get some kick from the @@ -494,7 +496,7 @@ struct cftype { * at all. The private field can be used to determine the * kick type for multiplexing. */ - int (*trigger)(struct cgroup *cgrp, unsigned int event); + int (*trigger)(struct cgroup_subsys_state *css, unsigned int event); int (*release)(struct inode *inode, struct file *file); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 7b4d9d79570b..6c416092e324 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -85,7 +85,7 @@ extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); -extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); +extern struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css); static inline bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c049992f1ffa..6ee469837fda 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2235,34 +2235,38 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) } EXPORT_SYMBOL_GPL(cgroup_attach_task_all); -static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) +static int cgroup_tasks_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 pid) { - return attach_task_by_pid(cgrp, pid, false); + return attach_task_by_pid(css->cgroup, pid, false); } -static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) +static int cgroup_procs_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 tgid) { - return attach_task_by_pid(cgrp, tgid, true); + return attach_task_by_pid(css->cgroup, tgid, true); } -static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_release_agent_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { - BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); + BUILD_BUG_ON(sizeof(css->cgroup->root->release_agent_path) < PATH_MAX); if (strlen(buffer) >= PATH_MAX) return -EINVAL; - if (!cgroup_lock_live_group(cgrp)) + if (!cgroup_lock_live_group(css->cgroup)) return -ENODEV; mutex_lock(&cgroup_root_mutex); - strcpy(cgrp->root->release_agent_path, buffer); + strcpy(css->cgroup->root->release_agent_path, buffer); mutex_unlock(&cgroup_root_mutex); mutex_unlock(&cgroup_mutex); return 0; } -static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_release_agent_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { + struct cgroup *cgrp = css->cgroup; + if (!cgroup_lock_live_group(cgrp)) return -ENODEV; seq_puts(seq, cgrp->root->release_agent_path); @@ -2271,10 +2275,10 @@ static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, return 0; } -static int cgroup_sane_behavior_show(struct cgroup *cgrp, struct cftype *cft, - struct seq_file *seq) +static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { - seq_printf(seq, "%d\n", cgroup_sane_behavior(cgrp)); + seq_printf(seq, "%d\n", cgroup_sane_behavior(css->cgroup)); return 0; } @@ -2292,10 +2296,10 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 -static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_X64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2313,22 +2317,22 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, u64 val = simple_strtoull(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_u64(cgrp, cft, val); + retval = cft->write_u64(css, cft, val); } else { s64 val = simple_strtoll(strstrip(buffer), &end, 0); if (*end) return -EINVAL; - retval = cft->write_s64(cgrp, cft, val); + retval = cft->write_s64(css, cft, val); } if (!retval) retval = nbytes; return retval; } -static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static ssize_t cgroup_write_string(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + const char __user *userbuf, size_t nbytes, + loff_t *unused_ppos) { char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; int retval = 0; @@ -2351,7 +2355,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, } buffer[nbytes] = 0; /* nul-terminate */ - retval = cft->write_string(cgrp, cft, strstrip(buffer)); + retval = cft->write_string(css, cft, strstrip(buffer)); if (!retval) retval = nbytes; out: @@ -2361,60 +2365,60 @@ out: } static ssize_t cgroup_file_write(struct file *file, const char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->write) - return cft->write(cgrp, cft, file, buf, nbytes, ppos); + return cft->write(css, cft, file, buf, nbytes, ppos); if (cft->write_u64 || cft->write_s64) - return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_X64(css, cft, file, buf, nbytes, ppos); if (cft->write_string) - return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_write_string(css, cft, file, buf, nbytes, ppos); if (cft->trigger) { - int ret = cft->trigger(cgrp, (unsigned int)cft->private); + int ret = cft->trigger(css, (unsigned int)cft->private); return ret ? ret : nbytes; } return -EINVAL; } -static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - u64 val = cft->read_u64(cgrp, cft); + u64 val = cft->read_u64(css, cft); int len = sprintf(tmp, "%llu\n", (unsigned long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } -static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static ssize_t cgroup_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { char tmp[CGROUP_LOCAL_BUFFER_SIZE]; - s64 val = cft->read_s64(cgrp, cft); + s64 val = cft->read_s64(css, cft); int len = sprintf(tmp, "%lld\n", (long long) val); return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); } static ssize_t cgroup_file_read(struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) + size_t nbytes, loff_t *ppos) { + struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->read) - return cft->read(cgrp, cft, file, buf, nbytes, ppos); + return cft->read(css, cft, file, buf, nbytes, ppos); if (cft->read_u64) - return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_u64(css, cft, file, buf, nbytes, ppos); if (cft->read_s64) - return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); + return cgroup_read_s64(css, cft, file, buf, nbytes, ppos); return -EINVAL; } @@ -2433,16 +2437,16 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct cftype *cft = cfe->type; - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css = cgroup_file_css(cfe); if (cft->read_map) { struct cgroup_map_cb cb = { .fill = cgroup_map_add, .state = m, }; - return cft->read_map(cgrp, cft, &cb); + return cft->read_map(css, cft, &cb); } - return cft->read_seq_string(cgrp, cft, m); + return cft->read_seq_string(css, cft, m); } static const struct file_operations cgroup_seqfile_operations = { @@ -3860,21 +3864,20 @@ static int cgroup_procs_open(struct inode *unused, struct file *file) return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); } -static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft) { - return notify_on_release(cgrp); + return notify_on_release(css->cgroup); } -static int cgroup_write_notify_on_release(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - clear_bit(CGRP_RELEASABLE, &cgrp->flags); + clear_bit(CGRP_RELEASABLE, &css->cgroup->flags); if (val) - set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); else - clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); + clear_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags); return 0; } @@ -3972,9 +3975,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int cgroup_write_event_control(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { + struct cgroup *cgrp = css->cgroup; struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; @@ -4082,20 +4086,19 @@ out_kfree: return ret; } -static u64 cgroup_clone_children_read(struct cgroup *cgrp, - struct cftype *cft) +static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); } -static int cgroup_clone_children_write(struct cgroup *cgrp, - struct cftype *cft, - u64 val) +static int cgroup_clone_children_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { if (val) - set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + set_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); else - clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); + clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags); return 0; } @@ -5585,17 +5588,19 @@ static void debug_css_free(struct cgroup_subsys_state *css) kfree(css); } -static u64 debug_taskcount_read(struct cgroup *cgrp, struct cftype *cft) +static u64 debug_taskcount_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return cgroup_task_count(cgrp); + return cgroup_task_count(css->cgroup); } -static u64 current_css_set_read(struct cgroup *cgrp, struct cftype *cft) +static u64 current_css_set_read(struct cgroup_subsys_state *css, + struct cftype *cft) { return (u64)(unsigned long)current->cgroups; } -static u64 current_css_set_refcount_read(struct cgroup *cgrp, +static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, struct cftype *cft) { u64 count; @@ -5606,7 +5611,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cgrp, return count; } -static int current_css_set_cg_links_read(struct cgroup *cgrp, +static int current_css_set_cg_links_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *seq) { @@ -5633,14 +5638,13 @@ static int current_css_set_cg_links_read(struct cgroup *cgrp, } #define MAX_TASKS_SHOWN_PER_CSS 25 -static int cgroup_css_links_read(struct cgroup *cgrp, - struct cftype *cft, - struct seq_file *seq) +static int cgroup_css_links_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *seq) { struct cgrp_cset_link *link; read_lock(&css_set_lock); - list_for_each_entry(link, &cgrp->cset_links, cset_link) { + list_for_each_entry(link, &css->cgroup->cset_links, cset_link) { struct css_set *cset = link->cset; struct task_struct *task; int count = 0; @@ -5659,9 +5663,9 @@ static int cgroup_css_links_read(struct cgroup *cgrp, return 0; } -static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) +static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return test_bit(CGRP_RELEASABLE, &cgrp->flags); + return test_bit(CGRP_RELEASABLE, &css->cgroup->flags); } static struct cftype debug_files[] = { diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f03a85719c3c..19613ba51444 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -245,7 +245,7 @@ out: /** * update_if_frozen - update whether a cgroup finished freezing - * @cgroup: cgroup of interest + * @css: css of interest * * Once FREEZING is initiated, transition to FROZEN is lazily updated by * calling this function. If the current state is FREEZING but not FROZEN, @@ -256,12 +256,12 @@ out: * update_if_frozen() on all descendants prior to invoking this function. * * Task states and freezer state might disagree while tasks are being - * migrated into or out of @cgroup, so we can't verify task states against + * migrated into or out of @css, so we can't verify task states against * @freezer state here. See freezer_attach() for details. */ -static void update_if_frozen(struct cgroup *cgroup) +static void update_if_frozen(struct cgroup_subsys_state *css) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); struct cgroup *pos; struct cgroup_iter it; struct task_struct *task; @@ -275,7 +275,7 @@ static void update_if_frozen(struct cgroup *cgroup) goto out_unlock; /* are all (live) children frozen? */ - cgroup_for_each_child(pos, cgroup) { + cgroup_for_each_child(pos, css->cgroup) { struct freezer *child = cgroup_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && @@ -284,9 +284,9 @@ static void update_if_frozen(struct cgroup *cgroup) } /* are all tasks frozen? */ - cgroup_iter_start(cgroup, &it); + cgroup_iter_start(css->cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + while ((task = cgroup_iter_next(css->cgroup, &it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -301,12 +301,12 @@ static void update_if_frozen(struct cgroup *cgroup) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_iter_end(cgroup, &it); + cgroup_iter_end(css->cgroup, &it); out_unlock: spin_unlock_irq(&freezer->lock); } -static int freezer_read(struct cgroup *cgroup, struct cftype *cft, +static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { struct cgroup *pos; @@ -314,13 +314,13 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft, rcu_read_lock(); /* update states bottom-up */ - cgroup_for_each_descendant_post(pos, cgroup) - update_if_frozen(pos); - update_if_frozen(cgroup); + cgroup_for_each_descendant_post(pos, css->cgroup) + update_if_frozen(cgroup_css(pos, freezer_subsys_id)); + update_if_frozen(css); rcu_read_unlock(); - seq_puts(m, freezer_state_strs(cgroup_freezer(cgroup)->state)); + seq_puts(m, freezer_state_strs(css_freezer(css)->state)); seq_putc(m, '\n'); return 0; } @@ -426,7 +426,7 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) rcu_read_unlock(); } -static int freezer_write(struct cgroup *cgroup, struct cftype *cft, +static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { bool freeze; @@ -438,20 +438,22 @@ static int freezer_write(struct cgroup *cgroup, struct cftype *cft, else return -EINVAL; - freezer_change_state(cgroup_freezer(cgroup), freeze); + freezer_change_state(css_freezer(css), freeze); return 0; } -static u64 freezer_self_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_self_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_SELF); } -static u64 freezer_parent_freezing_read(struct cgroup *cgroup, struct cftype *cft) +static u64 freezer_parent_freezing_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct freezer *freezer = cgroup_freezer(cgroup); + struct freezer *freezer = css_freezer(css); return (bool)(freezer->state & CGROUP_FREEZING_PARENT); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8ce3fdc3dfcc..89b76e1d3aa1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1603,9 +1603,10 @@ typedef enum { FILE_SPREAD_SLAB, } cpuset_filetype_t; -static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) +static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft, + u64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; @@ -1650,9 +1651,10 @@ out_unlock: return retval; } -static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) +static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft, + s64 val) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; int retval = -ENODEV; @@ -1676,10 +1678,10 @@ out_unlock: /* * Common handling for a write to a "cpus" or "mems" file. */ -static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, - const char *buf) +static int cpuset_write_resmask(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buf) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); struct cpuset *trialcs; int retval = -ENODEV; @@ -1758,13 +1760,12 @@ static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs) return count; } -static ssize_t cpuset_common_file_read(struct cgroup *cgrp, - struct cftype *cft, - struct file *file, - char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t cpuset_common_file_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; char *page; ssize_t retval = 0; @@ -1794,9 +1795,9 @@ out: return retval; } -static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuset_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_CPU_EXCLUSIVE: @@ -1825,9 +1826,9 @@ static u64 cpuset_read_u64(struct cgroup *cgrp, struct cftype *cft) return 0; } -static s64 cpuset_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpuset_read_s64(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuset *cs = cgroup_cs(cgrp); + struct cpuset *cs = css_cs(css); cpuset_filetype_t type = cft->private; switch (type) { case FILE_SCHED_RELAX_DOMAIN_LEVEL: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 622b7efc5ade..cc9a49266382 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7088,12 +7088,6 @@ static inline struct task_group *css_tg(struct cgroup_subsys_state *css) return css ? container_of(css, struct task_group, css) : NULL; } -/* return corresponding task_group object of a cgroup */ -static inline struct task_group *cgroup_tg(struct cgroup *cgrp) -{ - return css_tg(cgroup_css(cgrp, cpu_cgroup_subsys_id)); -} - static struct cgroup_subsys_state * cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -7179,15 +7173,16 @@ static void cpu_cgroup_exit(struct cgroup_subsys_state *css, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 shareval) +static int cpu_shares_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 shareval) { - return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval)); + return sched_group_set_shares(css_tg(css), scale_load(shareval)); } -static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); return (u64) scale_load_down(tg->shares); } @@ -7309,26 +7304,28 @@ long tg_get_cfs_period(struct task_group *tg) return cfs_period_us; } -static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_quota(cgroup_tg(cgrp)); + return tg_get_cfs_quota(css_tg(css)); } -static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype, - s64 cfs_quota_us) +static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css, + struct cftype *cftype, s64 cfs_quota_us) { - return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us); + return tg_set_cfs_quota(css_tg(css), cfs_quota_us); } -static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) { - return tg_get_cfs_period(cgroup_tg(cgrp)); + return tg_get_cfs_period(css_tg(css)); } -static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype, - u64 cfs_period_us) +static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 cfs_period_us) { - return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us); + return tg_set_cfs_period(css_tg(css), cfs_period_us); } struct cfs_schedulable_data { @@ -7409,10 +7406,10 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, +static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct task_group *tg = cgroup_tg(cgrp); + struct task_group *tg = css_tg(css); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; cb->fill(cb, "nr_periods", cfs_b->nr_periods); @@ -7425,26 +7422,28 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft, #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED -static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - s64 val) +static int cpu_rt_runtime_write(struct cgroup_subsys_state *css, + struct cftype *cft, s64 val) { - return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); + return sched_group_set_rt_runtime(css_tg(css), val); } -static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) +static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_runtime(cgroup_tg(cgrp)); + return sched_group_rt_runtime(css_tg(css)); } -static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, - u64 rt_period_us) +static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 rt_period_us) { - return sched_group_set_rt_period(cgroup_tg(cgrp), rt_period_us); + return sched_group_set_rt_period(css_tg(css), rt_period_us); } -static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, + struct cftype *cft) { - return sched_group_rt_period(cgroup_tg(cgrp)); + return sched_group_rt_period(css_tg(css)); } #endif /* CONFIG_RT_GROUP_SCHED */ diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 1b784d9b3630..f64722ff0299 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -38,12 +38,6 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) return css ? container_of(css, struct cpuacct, css) : NULL; } -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return css_ca(cgroup_css(cgrp, cpuacct_subsys_id)); -} - /* return cpu accounting group to which this task belongs */ static inline struct cpuacct *task_ca(struct task_struct *tsk) { @@ -138,9 +132,9 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) } /* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; int i; @@ -150,10 +144,10 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) return totalcpuusage; } -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) +static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 reset) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int err = 0; int i; @@ -169,10 +163,10 @@ out: return err; } -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) +static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct cpuacct *ca = cgroup_ca(cgroup); + struct cpuacct *ca = css_ca(css); u64 percpu; int i; @@ -189,10 +183,10 @@ static const char * const cpuacct_stat_desc[] = { [CPUACCT_STAT_SYSTEM] = "system", }; -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) +static int cpuacct_stats_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct cgroup_map_cb *cb) { - struct cpuacct *ca = cgroup_ca(cgrp); + struct cpuacct *ca = css_ca(css); int cpu; s64 val = 0; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index e2132435060f..bda8e44f6fde 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -39,12 +39,6 @@ struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) return s ? container_of(s, struct hugetlb_cgroup, css) : NULL; } -static inline -struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) -{ - return hugetlb_cgroup_from_css(cgroup_css(cgroup, hugetlb_subsys_id)); -} - static inline struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) { @@ -248,14 +242,15 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } -static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t hugetlb_cgroup_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, + loff_t *ppos) { u64 val; char str[64]; int idx, name, len; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); @@ -265,12 +260,12 @@ static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } -static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, - const char *buffer) +static int hugetlb_cgroup_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { int idx, name, ret; unsigned long long val; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(cft->private); name = MEMFILE_ATTR(cft->private); @@ -295,10 +290,11 @@ static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, return ret; } -static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) +static int hugetlb_cgroup_reset(struct cgroup_subsys_state *css, + unsigned int event) { int idx, name, ret = 0; - struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css); idx = MEMFILE_IDX(event); name = MEMFILE_ATTR(event); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 32cca0f0af0d..ab64dfc84f8c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -483,7 +483,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { return s ? container_of(s, struct mem_cgroup, css) : NULL; @@ -1035,7 +1034,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) +static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); } @@ -2951,10 +2950,10 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) } #ifdef CONFIG_SLABINFO -static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft, - struct seq_file *m) +static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct memcg_cache_params *params; if (!memcg_can_account_kmem(memcg)) @@ -4999,9 +4998,10 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) return 0; } -static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) +static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, + unsigned int event) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); int ret; if (mem_cgroup_is_root(memcg)) @@ -5014,16 +5014,17 @@ static int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) } -static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft) +static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - return mem_cgroup_from_cont(cont)->use_hierarchy; + return mem_cgroup_from_css(css)->use_hierarchy; } -static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, - u64 val) +static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { int retval = 0; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); mutex_lock(&memcg_create_mutex); @@ -5094,11 +5095,11 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) return val << PAGE_SHIFT; } -static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, - struct file *file, char __user *buf, - size_t nbytes, loff_t *ppos) +static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct file *file, + char __user *buf, size_t nbytes, loff_t *ppos) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); char str[64]; u64 val; int name, len; @@ -5131,11 +5132,11 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, return simple_read_from_buffer(buf, nbytes, ppos, str, len); } -static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) +static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) { int ret = -EINVAL; #ifdef CONFIG_MEMCG_KMEM - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); /* * For simplicity, we won't allow this to be disabled. It also can't * be changed if the cgroup has children already, or if tasks had @@ -5151,7 +5152,7 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { - if (cgroup_task_count(cont) || memcg_has_children(memcg)) { + if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; } @@ -5221,10 +5222,10 @@ out: * The user of this function is... * RES_LIMIT. */ -static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, +static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); enum res_type type; int name; unsigned long long val; @@ -5248,7 +5249,7 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else if (type == _MEMSWAP) ret = mem_cgroup_resize_memsw_limit(memcg, val); else if (type == _KMEM) - ret = memcg_update_kmem_limit(cont, val); + ret = memcg_update_kmem_limit(css, val); else return -EINVAL; break; @@ -5297,9 +5298,9 @@ out: *memsw_limit = min_memsw_limit; } -static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) +static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); int name; enum res_type type; @@ -5332,17 +5333,17 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) return 0; } -static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, +static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; + return mem_cgroup_from_css(css)->move_charge_at_immigrate; } #ifdef CONFIG_MMU -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; @@ -5357,7 +5358,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, return 0; } #else -static int mem_cgroup_move_charge_write(struct cgroup *cgrp, +static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { return -ENOSYS; @@ -5365,13 +5366,13 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, #endif #ifdef CONFIG_NUMA -static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, - struct seq_file *m) +static int memcg_numa_stat_show(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { int nid; unsigned long total_nr, file_nr, anon_nr, unevictable_nr; unsigned long node_nr; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); seq_printf(m, "total=%lu", total_nr); @@ -5416,10 +5417,10 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); } -static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, +static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *mi; unsigned int i; @@ -5503,17 +5504,18 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, return 0; } -static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft) +static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, + struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); return mem_cgroup_swappiness(memcg); } -static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, - u64 val) +static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); if (val > 100 || !parent) @@ -5829,10 +5831,10 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, spin_unlock(&memcg_oom_lock); } -static int mem_cgroup_oom_control_read(struct cgroup *cgrp, +static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); @@ -5843,10 +5845,10 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, return 0; } -static int mem_cgroup_oom_control_write(struct cgroup *cgrp, +static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 7f1654d3cec7..2a8a736e95cc 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -81,8 +81,8 @@ static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { - struct cgroup *cg = vmpressure_to_css(vmpr)->cgroup; - struct mem_cgroup *memcg = mem_cgroup_from_cont(cg); + struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); memcg = parent_mem_cgroup(memcg); if (!memcg) diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 8d095b4c2f6f..e00f60e5baea 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -168,15 +168,14 @@ static void cgrp_css_free(struct cgroup_subsys_state *css) kfree(css); } -static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft) +static u64 read_prioidx(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp->id; + return css->cgroup->id; } -static int read_priomap(struct cgroup *cont, struct cftype *cft, +static int read_priomap(struct cgroup_subsys_state *css, struct cftype *cft, struct cgroup_map_cb *cb) { - struct cgroup_subsys_state *css = cgroup_css(cont, net_prio_subsys_id); struct net_device *dev; rcu_read_lock(); @@ -186,10 +185,9 @@ static int read_priomap(struct cgroup *cont, struct cftype *cft, return 0; } -static int write_priomap(struct cgroup *cgrp, struct cftype *cft, +static int write_priomap(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, net_prio_subsys_id); char devname[IFNAMSIZ + 1]; struct net_device *dev; u32 prio; diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index da14436c1735..8a57d79b0b16 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -132,10 +132,10 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) return 0; } -static int tcp_cgroup_write(struct cgroup *cont, struct cftype *cft, +static int tcp_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, const char *buffer) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); unsigned long long val; int ret = 0; @@ -180,9 +180,9 @@ static u64 tcp_read_usage(struct mem_cgroup *memcg) return res_counter_read_u64(&tcp->tcp_memory_allocated, RES_USAGE); } -static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) +static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); u64 val; switch (cft->private) { @@ -202,13 +202,13 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft) return val; } -static int tcp_cgroup_reset(struct cgroup *cont, unsigned int event) +static int tcp_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) { struct mem_cgroup *memcg; struct tcp_memcontrol *tcp; struct cg_proto *cg_proto; - memcg = mem_cgroup_from_cont(cont); + memcg = mem_cgroup_from_css(css); cg_proto = tcp_prot.proto_cgroup(memcg); if (!cg_proto) return 0; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index dc3983835893..8ea1184cec92 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -28,11 +28,6 @@ static inline struct cgroup_cls_state *css_cls_state(struct cgroup_subsys_state return css ? container_of(css, struct cgroup_cls_state, css) : NULL; } -static inline struct cgroup_cls_state *cgrp_cls_state(struct cgroup *cgrp) -{ - return css_cls_state(cgroup_css(cgrp, net_cls_subsys_id)); -} - static inline struct cgroup_cls_state *task_cls_state(struct task_struct *p) { return css_cls_state(task_css(p, net_cls_subsys_id)); @@ -87,14 +82,15 @@ static void cgrp_attach(struct cgroup_subsys_state *css, } } -static u64 read_classid(struct cgroup *cgrp, struct cftype *cft) +static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft) { - return cgrp_cls_state(cgrp)->classid; + return css_cls_state(css)->classid; } -static int write_classid(struct cgroup *cgrp, struct cftype *cft, u64 value) +static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft, + u64 value) { - cgrp_cls_state(cgrp)->classid = (u32) value; + css_cls_state(css)->classid = (u32) value; return 0; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 7293ac49ba7b..e0ca464fa854 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -289,10 +289,10 @@ static void set_majmin(char *str, unsigned m) sprintf(str, "%u", m); } -static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) +static int devcgroup_seq_read(struct cgroup_subsys_state *css, + struct cftype *cft, struct seq_file *m) { - struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); + struct dev_cgroup *devcgroup = css_to_devcgroup(css); struct dev_exception_item *ex; char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN]; @@ -669,13 +669,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, return rc; } -static int devcgroup_access_write(struct cgroup *cgrp, struct cftype *cft, - const char *buffer) +static int devcgroup_access_write(struct cgroup_subsys_state *css, + struct cftype *cft, const char *buffer) { int retval; mutex_lock(&devcgroup_mutex); - retval = devcgroup_update_access(cgroup_to_devcgroup(cgrp), + retval = devcgroup_update_access(css_to_devcgroup(css), cft->private, buffer); mutex_unlock(&devcgroup_mutex); return retval; -- cgit v1.3-8-gc7d7 From 3b287a505ef4024634beb12a93773254909d5dae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: convert cgroup_next_sibling() to cgroup_next_child() cgroup is transitioning to using css (cgroup_subsys_state) as the main subsys interface handle instead of cgroup and the iterators will be updated to use css too. The iterators need to walk the cgroup hierarchy and return the css's matching the origin css, which is a bit cumbersome to open code. This patch converts cgroup_next_sibling() to cgroup_next_child() so that it can handle all steps of direct child iteration. This will be used to update iterators to take @css instead of @cgrp. In addition to the new iteration init handling, cgroup_next_child() is restructured so that the different branches share the end of iteration condition check. This patch doesn't change any behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 4 ++-- kernel/cgroup.c | 59 +++++++++++++++++++++++++------------------------- 2 files changed, 32 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0b91436c68ef..5f9ba5881717 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -779,7 +779,7 @@ static inline struct cgroup *cgroup_from_id(struct cgroup_subsys *ss, int id) return idr_find(&ss->root->cgroup_idr, id); } -struct cgroup *cgroup_next_sibling(struct cgroup *pos); +struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp); /** * cgroup_for_each_child - iterate through children of a cgroup @@ -802,7 +802,7 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos); #define cgroup_for_each_child(pos, cgrp) \ for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ struct cgroup, sibling); \ - (pos); (pos) = cgroup_next_sibling((pos))) + (pos); (pos) = cgroup_next_child((pos), (cgrp))) struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6ee469837fda..dd55244952bd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3037,15 +3037,16 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_sibling - find the next sibling of a given cgroup - * @pos: the current cgroup + * cgroup_next_child - find the next child of a given cgroup + * @pos: the current position (%NULL to initiate traversal) + * @cgrp: cgroup whose descendants to walk * - * This function returns the next sibling of @pos and should be called - * under RCU read lock. The only requirement is that @pos is accessible. - * The next sibling is guaranteed to be returned regardless of @pos's - * state. + * This function returns the next child of @cgrp and should be called under + * RCU read lock. The only requirement is that @cgrp and @pos are + * accessible. The next sibling is guaranteed to be returned regardless of + * their states. */ -struct cgroup *cgroup_next_sibling(struct cgroup *pos) +struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) { struct cgroup *next; @@ -3061,30 +3062,30 @@ struct cgroup *cgroup_next_sibling(struct cgroup *pos) * safe to dereference from this RCU critical section. If * ->sibling.next is inaccessible, cgroup_is_dead() is guaranteed * to be visible as %true here. + * + * If @pos is dead, its next pointer can't be dereferenced; + * however, as each cgroup is given a monotonically increasing + * unique serial number and always appended to the sibling list, + * the next one can be found by walking the parent's children until + * we see a cgroup with higher serial number than @pos's. While + * this path can be slower, it's taken only when either the current + * cgroup is removed or iteration and removal race. */ - if (likely(!cgroup_is_dead(pos))) { + if (!pos) { + next = list_entry_rcu(cgrp->children.next, struct cgroup, sibling); + } else if (likely(!cgroup_is_dead(pos))) { next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); - if (&next->sibling != &pos->parent->children) - return next; - return NULL; + } else { + list_for_each_entry_rcu(next, &cgrp->children, sibling) + if (next->serial_nr > pos->serial_nr) + break; } - /* - * Can't dereference the next pointer. Each cgroup is given a - * monotonically increasing unique serial number and always - * appended to the sibling list, so the next one can be found by - * walking the parent's children until we see a cgroup with higher - * serial number than @pos's. - * - * While this path can be slow, it's taken only when either the - * current cgroup is removed or iteration and removal race. - */ - list_for_each_entry_rcu(next, &pos->parent->children, sibling) - if (next->serial_nr > pos->serial_nr) - return next; + if (&next->sibling != &cgrp->children) + return next; return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_sibling); +EXPORT_SYMBOL_GPL(cgroup_next_child); /** * cgroup_next_descendant_pre - find the next descendant for pre-order walk @@ -3117,7 +3118,7 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, /* no child, visit my or the closest ancestor's next sibling */ while (pos != cgroup) { - next = cgroup_next_sibling(pos); + next = cgroup_next_child(pos, pos->parent); if (next) return next; pos = pos->parent; @@ -3198,7 +3199,7 @@ struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_sibling(pos); + next = cgroup_next_child(pos, pos->parent); if (next) return cgroup_leftmost_descendant(next); @@ -4549,9 +4550,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_sibling() to + * CGRP_DEAD assertion is depended upon by cgroup_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_sibling() for details. + * cgroup_next_child() for details. */ set_bit(CGRP_DEAD, &cgrp->flags); -- cgit v1.3-8-gc7d7 From f48e3924dca268c677c4e338e5d91ad9e6fe6b9e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:24 -0400 Subject: cgroup: always use cgroup_next_child() to walk the children list There are several places where the children list is accessed directly. This patch converts those places to use cgroup_next_child(). This will help updating the hierarchy iterators to use @css instead of @cgrp. While cgroup_next_child() can be heavy in pathological cases - e.g. a lot of dead children, this shouldn't cause any noticeable behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 5 ++--- kernel/cgroup.c | 7 +++---- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5f9ba5881717..c288bce428f8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -800,9 +800,8 @@ struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp); * the start of the next iteration by, for example, bumping the css refcnt. */ #define cgroup_for_each_child(pos, cgrp) \ - for ((pos) = list_first_or_null_rcu(&(cgrp)->children, \ - struct cgroup, sibling); \ - (pos); (pos) = cgroup_next_child((pos), (cgrp))) + for ((pos) = cgroup_next_child(NULL, (cgrp)); (pos); \ + (pos) = cgroup_next_child((pos), (cgrp))) struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, struct cgroup *cgroup); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index dd55244952bd..2b7354faaca7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3112,7 +3112,7 @@ struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, pos = cgroup; /* visit the first child if exists */ - next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); + next = cgroup_next_child(NULL, pos); if (next) return next; @@ -3151,7 +3151,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - list_for_each_entry_rcu(tmp, &last->children, sibling) + cgroup_for_each_child(tmp, last) pos = tmp; } while (pos); @@ -3165,8 +3165,7 @@ static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) do { last = pos; - pos = list_first_or_null_rcu(&pos->children, struct cgroup, - sibling); + pos = cgroup_next_child(NULL, pos); } while (pos); return last; -- cgit v1.3-8-gc7d7 From 492eb21b98f88e411a8bb43d6edcd7d7022add10 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:25 -0400 Subject: cgroup: make hierarchy iterators deal with cgroup_subsys_state instead of cgroup cgroup is currently in the process of transitioning to using css (cgroup_subsys_state) as the primary handle instead of cgroup in subsystem API. For hierarchy iterators, this is beneficial because * In most cases, css is the only thing subsystems care about anyway. * On the planned unified hierarchy, iterations for different subsystems will need to skip over different subtrees of the hierarchy depending on which subsystems are enabled on each cgroup. Passing around css makes it unnecessary to explicitly specify the subsystem in question as css is intersection between cgroup and subsystem * For the planned unified hierarchy, css's would need to be created and destroyed dynamically independent from cgroup hierarchy. Having cgroup core manage css iteration makes enforcing deref rules a lot easier. Most subsystem conversions are straight-forward. Noteworthy changes are * blkio: cgroup_to_blkcg() is no longer used. Removed. * freezer: cgroup_freezer() is no longer used. Removed. * devices: cgroup_to_devcgroup() is no longer used. Removed. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley Cc: Jens Axboe --- block/blk-cgroup.c | 8 +-- block/blk-cgroup.h | 25 ++++----- block/blk-throttle.c | 8 +-- include/linux/cgroup.h | 88 ++++++++++++++++--------------- kernel/cgroup.c | 131 ++++++++++++++++++++++++++--------------------- kernel/cgroup_freezer.c | 25 ++++----- kernel/cpuset.c | 58 ++++++++++----------- mm/memcontrol.c | 20 ++++---- security/device_cgroup.c | 11 ++-- 9 files changed, 187 insertions(+), 187 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index f46f3c69179c..4b40640240a4 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -614,7 +614,7 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; u64 sum; lockdep_assert_held(pd->blkg->q->queue_lock); @@ -622,7 +622,7 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) sum = blkg_stat_read((void *)pd + off); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_stat *stat = (void *)pos_pd + off; @@ -649,7 +649,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, { struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; struct blkg_rwstat sum; int i; @@ -658,7 +658,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, sum = blkg_rwstat_read((void *)pd + off); rcu_read_lock(); - blkg_for_each_descendant_pre(pos_blkg, pos_cgrp, pd_to_blkg(pd)) { + blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); struct blkg_rwstat *rwstat = (void *)pos_pd + off; struct blkg_rwstat tmp; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b6802c46d68f..855538630300 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -184,11 +184,6 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) return css ? container_of(css, struct blkcg, css) : NULL; } -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) -{ - return css_to_blkcg(cgroup_css(cgroup, blkio_subsys_id)); -} - static inline struct blkcg *task_blkcg(struct task_struct *tsk) { return css_to_blkcg(task_css(tsk, blkio_subsys_id)); @@ -289,32 +284,31 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, /** * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may - * update @pos_cgrp by calling cgroup_rightmost_descendant() to skip - * subtree. + * update @pos_css by calling css_rightmost_descendant() to skip subtree. */ -#define blkg_for_each_descendant_pre(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_pre((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) /** * blkg_for_each_descendant_post - post-order walk of a blkg's descendants * @d_blkg: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order * traversal instead. Synchronization rules are the same. */ -#define blkg_for_each_descendant_post(d_blkg, pos_cgrp, p_blkg) \ - cgroup_for_each_descendant_post((pos_cgrp), (p_blkg)->blkcg->css.cgroup) \ - if (((d_blkg) = __blkg_lookup(cgroup_to_blkcg(pos_cgrp), \ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css), \ (p_blkg)->q, false))) /** @@ -577,7 +571,6 @@ static inline int blkcg_activate_policy(struct request_queue *q, static inline void blkcg_deactivate_policy(struct request_queue *q, const struct blkcg_policy *pol) { } -static inline struct blkcg *cgroup_to_blkcg(struct cgroup *cgroup) { return NULL; } static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; } static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 88bcfb651b0b..8cefa7f8590e 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1349,7 +1349,7 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, struct throtl_grp *tg; struct throtl_service_queue *sq; struct blkcg_gq *blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; int ret; ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); @@ -1380,7 +1380,7 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, * blk-throttle. */ tg_update_has_rules(tg); - blkg_for_each_descendant_pre(blkg, pos_cgrp, ctx.blkg) + blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) tg_update_has_rules(blkg_to_tg(blkg)); /* @@ -1623,7 +1623,7 @@ void blk_throtl_drain(struct request_queue *q) { struct throtl_data *td = q->td; struct blkcg_gq *blkg; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; struct bio *bio; int rw; @@ -1636,7 +1636,7 @@ void blk_throtl_drain(struct request_queue *q) * better to walk service_queue tree directly but blkg walk is * easier. */ - blkg_for_each_descendant_post(blkg, pos_cgrp, td->queue->root_blkg) + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) tg_drain_bios(&blkg_to_tg(blkg)->service_queue); tg_drain_bios(&td_root_tg(td)->service_queue); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c288bce428f8..4bc22f4a1abb 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -779,68 +779,72 @@ static inline struct cgroup *cgroup_from_id(struct cgroup_subsys *ss, int id) return idr_find(&ss->root->cgroup_idr, id); } -struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp); +struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *parent); /** - * cgroup_for_each_child - iterate through children of a cgroup - * @pos: the cgroup * to use as the loop cursor - * @cgrp: cgroup whose children to walk + * css_for_each_child - iterate through children of a css + * @pos: the css * to use as the loop cursor + * @parent: css whose children to walk * - * Walk @cgrp's children. Must be called under rcu_read_lock(). A child - * cgroup which hasn't finished ->css_online() or already has finished + * Walk @parent's children. Must be called under rcu_read_lock(). A child + * css which hasn't finished ->css_online() or already has finished * ->css_offline() may show up during traversal and it's each subsystem's * responsibility to verify that each @pos is alive. * * If a subsystem synchronizes against the parent in its ->css_online() and - * before starting iterating, a cgroup which finished ->css_online() is + * before starting iterating, a css which finished ->css_online() is * guaranteed to be visible in the future iterations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_child(pos, cgrp) \ - for ((pos) = cgroup_next_child(NULL, (cgrp)); (pos); \ - (pos) = cgroup_next_child((pos), (cgrp))) +#define css_for_each_child(pos, parent) \ + for ((pos) = css_next_child(NULL, (parent)); (pos); \ + (pos) = css_next_child((pos), (parent))) -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup); -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); + +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos); /** - * cgroup_for_each_descendant_pre - pre-order walk of a cgroup's descendants - * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose descendants to walk + * css_for_each_descendant_pre - pre-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @root: css whose descendants to walk * - * Walk @cgroup's descendants. Must be called under rcu_read_lock(). A - * descendant cgroup which hasn't finished ->css_online() or already has + * Walk @root's descendants. Must be called under rcu_read_lock(). A + * descendant css which hasn't finished ->css_online() or already has * finished ->css_offline() may show up during traversal and it's each * subsystem's responsibility to verify that each @pos is alive. * * If a subsystem synchronizes against the parent in its ->css_online() and * before starting iterating, and synchronizes against @pos on each - * iteration, any descendant cgroup which finished ->css_online() is + * iteration, any descendant css which finished ->css_online() is * guaranteed to be visible in the future iterations. * * In other words, the following guarantees that a descendant can't escape * state updates of its ancestors. * - * my_online(@cgrp) + * my_online(@css) * { - * Lock @cgrp->parent and @cgrp; - * Inherit state from @cgrp->parent; + * Lock @css's parent and @css; + * Inherit state from the parent; * Unlock both. * } * - * my_update_state(@cgrp) + * my_update_state(@css) * { - * Lock @cgrp; - * Update @cgrp's state; - * Unlock @cgrp; + * Lock @css; + * Update @css's state; + * Unlock @css; * - * cgroup_for_each_descendant_pre(@pos, @cgrp) { + * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; - * Verify @pos is alive and inherit state from @pos->parent; + * Verify @pos is alive and inherit state from @pos's parent; * Unlock @pos; * } * } @@ -851,8 +855,7 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one - * inheritance happens for any cgroup after the latest update to its - * parent. + * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. @@ -865,25 +868,26 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos); * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ -#define cgroup_for_each_descendant_pre(pos, cgroup) \ - for (pos = cgroup_next_descendant_pre(NULL, (cgroup)); (pos); \ - pos = cgroup_next_descendant_pre((pos), (cgroup))) +#define css_for_each_descendant_pre(pos, css) \ + for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ + (pos) = css_next_descendant_pre((pos), (css))) -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup); +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *css); /** - * cgroup_for_each_descendant_post - post-order walk of a cgroup's descendants - * @pos: the cgroup * to use as the loop cursor - * @cgroup: cgroup whose descendants to walk + * css_for_each_descendant_post - post-order walk of a css's descendants + * @pos: the css * to use as the loop cursor + * @css: css whose descendants to walk * - * Similar to cgroup_for_each_descendant_pre() but performs post-order + * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. Note that the walk visibility guarantee described in * pre-order walk doesn't apply the same to post-order walks. */ -#define cgroup_for_each_descendant_post(pos, cgroup) \ - for (pos = cgroup_next_descendant_post(NULL, (cgroup)); (pos); \ - pos = cgroup_next_descendant_post((pos), (cgroup))) +#define css_for_each_descendant_post(pos, css) \ + for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ + (pos) = css_next_descendant_post((pos), (css))) /* A cgroup_iter should be treated as an opaque object */ struct cgroup_iter { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2b7354faaca7..91eac33fac86 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2814,8 +2814,8 @@ static void cgroup_cfts_prepare(void) /* * Thanks to the entanglement with vfs inode locking, we can't walk * the existing cgroups under cgroup_mutex and create files. - * Instead, we use cgroup_for_each_descendant_pre() and drop RCU - * read lock before calling cgroup_addrm_files(). + * Instead, we use css_for_each_descendant_pre() and drop RCU read + * lock before calling cgroup_addrm_files(). */ mutex_lock(&cgroup_mutex); } @@ -2825,10 +2825,11 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) { LIST_HEAD(pending); struct cgroup_subsys *ss = cfts[0].ss; - struct cgroup *cgrp, *root = &ss->root->top_cgroup; + struct cgroup *root = &ss->root->top_cgroup; struct super_block *sb = ss->root->sb; struct dentry *prev = NULL; struct inode *inode; + struct cgroup_subsys_state *css; u64 update_before; int ret = 0; @@ -2861,7 +2862,9 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) /* add/rm files for all cgroups created before */ rcu_read_lock(); - cgroup_for_each_descendant_pre(cgrp, root) { + css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { + struct cgroup *cgrp = css->cgroup; + if (cgroup_is_dead(cgrp)) continue; @@ -3037,17 +3040,21 @@ static void cgroup_enable_task_cg_lists(void) } /** - * cgroup_next_child - find the next child of a given cgroup - * @pos: the current position (%NULL to initiate traversal) - * @cgrp: cgroup whose descendants to walk + * css_next_child - find the next child of a given css + * @pos_css: the current position (%NULL to initiate traversal) + * @parent_css: css whose children to walk * - * This function returns the next child of @cgrp and should be called under - * RCU read lock. The only requirement is that @cgrp and @pos are - * accessible. The next sibling is guaranteed to be returned regardless of - * their states. + * This function returns the next child of @parent_css and should be called + * under RCU read lock. The only requirement is that @parent_css and + * @pos_css are accessible. The next sibling is guaranteed to be returned + * regardless of their states. */ -struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) +struct cgroup_subsys_state * +css_next_child(struct cgroup_subsys_state *pos_css, + struct cgroup_subsys_state *parent_css) { + struct cgroup *pos = pos_css ? pos_css->cgroup : NULL; + struct cgroup *cgrp = parent_css->cgroup; struct cgroup *next; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3081,59 +3088,64 @@ struct cgroup *cgroup_next_child(struct cgroup *pos, struct cgroup *cgrp) break; } - if (&next->sibling != &cgrp->children) - return next; - return NULL; + if (&next->sibling == &cgrp->children) + return NULL; + + if (parent_css->ss) + return cgroup_css(next, parent_css->ss->subsys_id); + else + return &next->dummy_css; } -EXPORT_SYMBOL_GPL(cgroup_next_child); +EXPORT_SYMBOL_GPL(css_next_child); /** - * cgroup_next_descendant_pre - find the next descendant for pre-order walk + * css_next_descendant_pre - find the next descendant for pre-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_pre(). Find the next - * descendant to visit for pre-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_pre(). Find the next descendant + * to visit for pre-order traversal of @root's descendants. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos - * and @cgroup are accessible and @pos is a descendant of @cgroup. + * and @root are accessible and @pos is a descendant of @root. */ -struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_pre(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @cgroup */ + /* if first iteration, pretend we just visited @root */ if (!pos) - pos = cgroup; + pos = root; /* visit the first child if exists */ - next = cgroup_next_child(NULL, pos); + next = css_next_child(NULL, pos); if (next) return next; /* no child, visit my or the closest ancestor's next sibling */ - while (pos != cgroup) { - next = cgroup_next_child(pos, pos->parent); + while (pos != root) { + next = css_next_child(pos, css_parent(pos)); if (next) return next; - pos = pos->parent; + pos = css_parent(pos); } return NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); +EXPORT_SYMBOL_GPL(css_next_descendant_pre); /** - * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup - * @pos: cgroup of interest + * css_rightmost_descendant - return the rightmost descendant of a css + * @pos: css of interest * - * Return the rightmost descendant of @pos. If there's no descendant, - * @pos is returned. This can be used during pre-order traversal to skip + * Return the rightmost descendant of @pos. If there's no descendant, @pos + * is returned. This can be used during pre-order traversal to skip * subtree of @pos. * * While this function requires RCU read locking, it doesn't require the @@ -3141,9 +3153,10 @@ EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); * function will return the correct rightmost descendant as long as @pos is * accessible. */ -struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) +struct cgroup_subsys_state * +css_rightmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last, *tmp; + struct cgroup_subsys_state *last, *tmp; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -3151,62 +3164,64 @@ struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) last = pos; /* ->prev isn't RCU safe, walk ->next till the end */ pos = NULL; - cgroup_for_each_child(tmp, last) + css_for_each_child(tmp, last) pos = tmp; } while (pos); return last; } -EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); +EXPORT_SYMBOL_GPL(css_rightmost_descendant); -static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) +static struct cgroup_subsys_state * +css_leftmost_descendant(struct cgroup_subsys_state *pos) { - struct cgroup *last; + struct cgroup_subsys_state *last; do { last = pos; - pos = cgroup_next_child(NULL, pos); + pos = css_next_child(NULL, pos); } while (pos); return last; } /** - * cgroup_next_descendant_post - find the next descendant for post-order walk + * css_next_descendant_post - find the next descendant for post-order walk * @pos: the current position (%NULL to initiate traversal) - * @cgroup: cgroup whose descendants to walk + * @root: css whose descendants to walk * - * To be used by cgroup_for_each_descendant_post(). Find the next - * descendant to visit for post-order traversal of @cgroup's descendants. + * To be used by css_for_each_descendant_post(). Find the next descendant + * to visit for post-order traversal of @root's descendants. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This * function will return the correct next descendant as long as both @pos * and @cgroup are accessible and @pos is a descendant of @cgroup. */ -struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, - struct cgroup *cgroup) +struct cgroup_subsys_state * +css_next_descendant_post(struct cgroup_subsys_state *pos, + struct cgroup_subsys_state *root) { - struct cgroup *next; + struct cgroup_subsys_state *next; WARN_ON_ONCE(!rcu_read_lock_held()); /* if first iteration, visit the leftmost descendant */ if (!pos) { - next = cgroup_leftmost_descendant(cgroup); - return next != cgroup ? next : NULL; + next = css_leftmost_descendant(root); + return next != root ? next : NULL; } /* if there's an unvisited sibling, visit its leftmost descendant */ - next = cgroup_next_child(pos, pos->parent); + next = css_next_child(pos, css_parent(pos)); if (next) - return cgroup_leftmost_descendant(next); + return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = pos->parent; - return next != cgroup ? next : NULL; + next = css_parent(pos); + return next != root ? next : NULL; } -EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); +EXPORT_SYMBOL_GPL(css_next_descendant_post); void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) @@ -4549,9 +4564,9 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) /* * Mark @cgrp dead. This prevents further task migration and child * creation by disabling cgroup_lock_live_group(). Note that - * CGRP_DEAD assertion is depended upon by cgroup_next_child() to + * CGRP_DEAD assertion is depended upon by css_next_child() to * resume iteration after dropping RCU read lock. See - * cgroup_next_child() for details. + * css_next_child() for details. */ set_bit(CGRP_DEAD, &cgrp->flags); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 19613ba51444..98ca48d9ceb4 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -50,11 +50,6 @@ static inline struct freezer *css_freezer(struct cgroup_subsys_state *css) return css ? container_of(css, struct freezer, css) : NULL; } -static inline struct freezer *cgroup_freezer(struct cgroup *cgroup) -{ - return css_freezer(cgroup_css(cgroup, freezer_subsys_id)); -} - static inline struct freezer *task_freezer(struct task_struct *task) { return css_freezer(task_css(task, freezer_subsys_id)); @@ -120,7 +115,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css) /* * The following double locking and freezing state inheritance * guarantee that @cgroup can never escape ancestors' freezing - * states. See cgroup_for_each_descendant_pre() for details. + * states. See css_for_each_descendant_pre() for details. */ if (parent) spin_lock_irq(&parent->lock); @@ -262,7 +257,7 @@ out: static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); - struct cgroup *pos; + struct cgroup_subsys_state *pos; struct cgroup_iter it; struct task_struct *task; @@ -275,8 +270,8 @@ static void update_if_frozen(struct cgroup_subsys_state *css) goto out_unlock; /* are all (live) children frozen? */ - cgroup_for_each_child(pos, css->cgroup) { - struct freezer *child = cgroup_freezer(pos); + css_for_each_child(pos, css) { + struct freezer *child = css_freezer(pos); if ((child->state & CGROUP_FREEZER_ONLINE) && !(child->state & CGROUP_FROZEN)) @@ -309,13 +304,13 @@ out_unlock: static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; rcu_read_lock(); /* update states bottom-up */ - cgroup_for_each_descendant_post(pos, css->cgroup) - update_if_frozen(cgroup_css(pos, freezer_subsys_id)); + css_for_each_descendant_post(pos, css) + update_if_frozen(pos); update_if_frozen(css); rcu_read_unlock(); @@ -396,7 +391,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze, */ static void freezer_change_state(struct freezer *freezer, bool freeze) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; /* update @freezer */ spin_lock_irq(&freezer->lock); @@ -409,8 +404,8 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) * CGROUP_FREEZING_PARENT. */ rcu_read_lock(); - cgroup_for_each_descendant_pre(pos, freezer->css.cgroup) { - struct freezer *pos_f = cgroup_freezer(pos); + css_for_each_descendant_pre(pos, &freezer->css) { + struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 89b76e1d3aa1..be4f5036ea5e 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -210,29 +210,29 @@ static struct cpuset top_cpuset = { /** * cpuset_for_each_child - traverse online children of a cpuset * @child_cs: loop cursor pointing to the current child - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @parent_cs: target cpuset to walk children of * * Walk @child_cs through the online children of @parent_cs. Must be used * with RCU read locked. */ -#define cpuset_for_each_child(child_cs, pos_cgrp, parent_cs) \ - cgroup_for_each_child((pos_cgrp), (parent_cs)->css.cgroup) \ - if (is_cpuset_online(((child_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_child(child_cs, pos_css, parent_cs) \ + css_for_each_child((pos_css), &(parent_cs)->css) \ + if (is_cpuset_online(((child_cs) = css_cs((pos_css))))) /** * cpuset_for_each_descendant_pre - pre-order walk of a cpuset's descendants * @des_cs: loop cursor pointing to the current descendant - * @pos_cgrp: used for iteration + * @pos_css: used for iteration * @root_cs: target cpuset to walk ancestor of * * Walk @des_cs through the online descendants of @root_cs. Must be used - * with RCU read locked. The caller may modify @pos_cgrp by calling - * cgroup_rightmost_descendant() to skip subtree. + * with RCU read locked. The caller may modify @pos_css by calling + * css_rightmost_descendant() to skip subtree. */ -#define cpuset_for_each_descendant_pre(des_cs, pos_cgrp, root_cs) \ - cgroup_for_each_descendant_pre((pos_cgrp), (root_cs)->css.cgroup) \ - if (is_cpuset_online(((des_cs) = cgroup_cs((pos_cgrp))))) +#define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ + css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ + if (is_cpuset_online(((des_cs) = css_cs((pos_css))))) /* * There are two global mutexes guarding cpuset structures - cpuset_mutex @@ -430,7 +430,7 @@ static void free_trial_cpuset(struct cpuset *trial) static int validate_change(struct cpuset *cur, struct cpuset *trial) { - struct cgroup *cgrp; + struct cgroup_subsys_state *css; struct cpuset *c, *par; int ret; @@ -438,7 +438,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) /* Each of our child cpusets must be a subset of us */ ret = -EBUSY; - cpuset_for_each_child(c, cgrp, cur) + cpuset_for_each_child(c, css, cur) if (!is_cpuset_subset(c, trial)) goto out; @@ -459,7 +459,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) * overlap */ ret = -EINVAL; - cpuset_for_each_child(c, cgrp, par) { + cpuset_for_each_child(c, css, par) { if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) && c != cur && cpumask_intersects(trial->cpus_allowed, c->cpus_allowed)) @@ -508,13 +508,13 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *root_cs) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } @@ -589,7 +589,7 @@ static int generate_sched_domains(cpumask_var_t **domains, struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; doms = NULL; dattr = NULL; @@ -618,7 +618,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csn = 0; rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, &top_cpuset) { + cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The @@ -635,7 +635,7 @@ static int generate_sched_domains(cpumask_var_t **domains, csa[csn++] = cp; /* skip @cp's subtree */ - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); } rcu_read_unlock(); @@ -886,16 +886,16 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, bool update_root, struct ptr_heap *heap) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (update_root) update_tasks_cpumask(root_cs, heap); rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp have some CPU */ if (!cpumask_empty(cp->cpus_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget(&cp->css)) @@ -1143,16 +1143,16 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, bool update_root, struct ptr_heap *heap) { struct cpuset *cp; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (update_root) update_tasks_nodemask(root_cs, heap); rcu_read_lock(); - cpuset_for_each_descendant_pre(cp, pos_cgrp, root_cs) { + cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { /* skip the whole subtree if @cp have some CPU */ if (!nodes_empty(cp->mems_allowed)) { - pos_cgrp = cgroup_rightmost_descendant(pos_cgrp); + pos_css = css_rightmost_descendant(pos_css); continue; } if (!css_tryget(&cp->css)) @@ -1973,7 +1973,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) struct cpuset *cs = css_cs(css); struct cpuset *parent = parent_cs(cs); struct cpuset *tmp_cs; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; if (!parent) return 0; @@ -2005,7 +2005,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) * (and likewise for mems) to the new cgroup. */ rcu_read_lock(); - cpuset_for_each_child(tmp_cs, pos_cgrp, parent) { + cpuset_for_each_child(tmp_cs, pos_css, parent) { if (is_mem_exclusive(tmp_cs) || is_cpu_exclusive(tmp_cs)) { rcu_read_unlock(); goto out_unlock; @@ -2252,10 +2252,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* if cpus or mems changed, we need to propagate to descendants */ if (cpus_updated || mems_updated) { struct cpuset *cs; - struct cgroup *pos_cgrp; + struct cgroup_subsys_state *pos_css; rcu_read_lock(); - cpuset_for_each_descendant_pre(cs, pos_cgrp, &top_cpuset) { + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { if (!css_tryget(&cs->css)) continue; rcu_read_unlock(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ab64dfc84f8c..2285319e23a9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1082,7 +1082,7 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, struct mem_cgroup *last_visited) { - struct cgroup *prev_cgroup, *next_cgroup; + struct cgroup_subsys_state *prev_css, *next_css; /* * Root is not visited by cgroup iterators so it needs an @@ -1091,11 +1091,9 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, if (!last_visited) return root; - prev_cgroup = (last_visited == root) ? NULL - : last_visited->css.cgroup; + prev_css = (last_visited == root) ? NULL : &last_visited->css; skip_node: - next_cgroup = cgroup_next_descendant_pre( - prev_cgroup, root->css.cgroup); + next_css = css_next_descendant_pre(prev_css, &root->css); /* * Even if we found a group we have to make sure it is @@ -1104,13 +1102,13 @@ skip_node: * last_visited css is safe to use because it is * protected by css_get and the tree walk is rcu safe. */ - if (next_cgroup) { - struct mem_cgroup *mem = mem_cgroup_from_cont( - next_cgroup); + if (next_css) { + struct mem_cgroup *mem = mem_cgroup_from_css(next_css); + if (css_tryget(&mem->css)) return mem; else { - prev_cgroup = next_cgroup; + prev_css = next_css; goto skip_node; } } @@ -4939,10 +4937,10 @@ static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) */ static inline bool __memcg_has_children(struct mem_cgroup *memcg) { - struct cgroup *pos; + struct cgroup_subsys_state *pos; /* bounce at first found */ - cgroup_for_each_child(pos, memcg->css.cgroup) + css_for_each_child(pos, &memcg->css) return true; return false; } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index e0ca464fa854..9bf230aa28b0 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -56,11 +56,6 @@ static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) return s ? container_of(s, struct dev_cgroup, css) : NULL; } -static inline struct dev_cgroup *cgroup_to_devcgroup(struct cgroup *cgroup) -{ - return css_to_devcgroup(cgroup_css(cgroup, devices_subsys_id)); -} - static inline struct dev_cgroup *task_devcgroup(struct task_struct *task) { return css_to_devcgroup(task_css(task, devices_subsys_id)); @@ -447,13 +442,13 @@ static void revalidate_active_exceptions(struct dev_cgroup *devcg) static int propagate_exception(struct dev_cgroup *devcg_root, struct dev_exception_item *ex) { - struct cgroup *root = devcg_root->css.cgroup, *pos; + struct cgroup_subsys_state *pos; int rc = 0; rcu_read_lock(); - cgroup_for_each_descendant_pre(pos, root) { - struct dev_cgroup *devcg = cgroup_to_devcgroup(pos); + css_for_each_descendant_pre(pos, &devcg_root->css) { + struct dev_cgroup *devcg = css_to_devcgroup(pos); /* * Because devcgroup_mutex is held, no devcg will become -- cgit v1.3-8-gc7d7 From d515876e9d951d8cf7fc7c90db2967664bdc89ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: relocate cgroup_advance_iter() For some reason, cgroup_advance_iter() is standing lonely all away from its iter comrades. Relocate it. This is cosmetic. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 48 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 91eac33fac86..d56d9363d4b3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2981,30 +2981,6 @@ int cgroup_task_count(const struct cgroup *cgrp) return count; } -/* - * Advance a list_head iterator. The iterator should be positioned at - * the start of a css_set - */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) -{ - struct list_head *l = it->cset_link; - struct cgrp_cset_link *link; - struct css_set *cset; - - /* Advance to the next non-empty css_set */ - do { - l = l->next; - if (l == &cgrp->cset_links) { - it->cset_link = NULL; - return; - } - link = list_entry(l, struct cgrp_cset_link, cset_link); - cset = link->cset; - } while (list_empty(&cset->tasks)); - it->cset_link = l; - it->task = cset->tasks.next; -} - /* * To reduce the fork() overhead for systems that are not actually * using their cgroups capability, we don't maintain the lists running @@ -3223,6 +3199,30 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, } EXPORT_SYMBOL_GPL(css_next_descendant_post); +/* + * Advance a list_head iterator. The iterator should be positioned at + * the start of a css_set + */ +static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) +{ + struct list_head *l = it->cset_link; + struct cgrp_cset_link *link; + struct css_set *cset; + + /* Advance to the next non-empty css_set */ + do { + l = l->next; + if (l == &cgrp->cset_links) { + it->cset_link = NULL; + return; + } + link = list_entry(l, struct cgrp_cset_link, cset_link); + cset = link->cset; + } while (list_empty(&cset->tasks)); + it->cset_link = l; + it->task = cset->tasks.next; +} + void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) __acquires(css_set_lock) { -- cgit v1.3-8-gc7d7 From 0942eeeef68f9493c1bcb1a52baf612b73fcf9fb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: rename cgroup_iter to cgroup_task_iter cgroup now has multiple iterators and it's quite confusing to have something which walks over tasks of a single cgroup named cgroup_iter. Let's rename it to cgroup_task_iter. While at it, reformat / update comments and replace the overview comment above the interface function decls with proper function comments. Such overview can be useful but function comments should be more than enough here. This is pure rename and doesn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- include/linux/cgroup.h | 31 ++++--------- kernel/cgroup.c | 114 ++++++++++++++++++++++++++++++++---------------- kernel/cgroup_freezer.c | 24 +++++----- mm/memcontrol.c | 10 ++--- 4 files changed, 102 insertions(+), 77 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 4bc22f4a1abb..ea439794bd9b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -889,31 +889,16 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) -/* A cgroup_iter should be treated as an opaque object */ -struct cgroup_iter { - struct list_head *cset_link; - struct list_head *task; +/* A cgroup_task_iter should be treated as an opaque object */ +struct cgroup_task_iter { + struct list_head *cset_link; + struct list_head *task; }; -/* - * To iterate across the tasks in a cgroup: - * - * 1) call cgroup_iter_start to initialize an iterator - * - * 2) call cgroup_iter_next() to retrieve member tasks until it - * returns NULL or until you want to end the iteration - * - * 3) call cgroup_iter_end() to destroy the iterator. - * - * Or, call cgroup_scan_tasks() to iterate through every task in a - * cgroup - cgroup_scan_tasks() holds the css_set_lock when calling - * the test_task() callback, but not while calling the process_task() - * callback. - */ -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it); -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it); -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it); +void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it); +struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, + struct cgroup_task_iter *it); +void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d56d9363d4b3..15c93f9c9e57 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -367,9 +367,11 @@ static struct cgrp_cset_link init_cgrp_cset_link; static int cgroup_init_idr(struct cgroup_subsys *ss, struct cgroup_subsys_state *css); -/* css_set_lock protects the list of css_set objects, and the - * chain of tasks off each css_set. Nests outside task->alloc_lock - * due to cgroup_iter_start() */ +/* + * css_set_lock protects the list of css_set objects, and the chain of + * tasks off each css_set. Nests outside task->alloc_lock due to + * cgroup_task_iter_start(). + */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -394,10 +396,12 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) return key; } -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ +/* + * We don't maintain the lists running through each css_set to its task + * until after the first call to cgroup_task_iter_start(). This reduces + * the fork()/exit() overhead for people who have cgroups compiled into + * their kernel but not actually in use. + */ static int use_task_css_set_links __read_mostly; static void __put_css_set(struct css_set *cset, int taskexit) @@ -2982,10 +2986,10 @@ int cgroup_task_count(const struct cgroup *cgrp) } /* - * To reduce the fork() overhead for systems that are not actually - * using their cgroups capability, we don't maintain the lists running - * through each css_set to its tasks until we see the list actually - * used - in other words after the first call to cgroup_iter_start(). + * To reduce the fork() overhead for systems that are not actually using + * their cgroups capability, we don't maintain the lists running through + * each css_set to its tasks until we see the list actually used - in other + * words after the first call to cgroup_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3199,11 +3203,15 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, } EXPORT_SYMBOL_GPL(css_next_descendant_post); -/* - * Advance a list_head iterator. The iterator should be positioned at - * the start of a css_set +/** + * cgroup_advance_task_iter - advance a task itererator to the next css_set + * @cgrp: the cgroup to walk tasks of + * @it: the iterator to advance + * + * Advance @it to the next css_set to walk. */ -static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) +static void cgroup_advance_task_iter(struct cgroup *cgrp, + struct cgroup_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3223,7 +3231,21 @@ static void cgroup_advance_iter(struct cgroup *cgrp, struct cgroup_iter *it) it->task = cset->tasks.next; } -void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * cgroup_task_iter_start - initiate task iteration + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator to use + * + * Initiate iteration through the tasks of @cgrp. The caller can call + * cgroup_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, cgroup_task_iter_end() must + * be called. + * + * Note that this function acquires a lock which is released when the + * iteration finishes. The caller can't sleep while iteration is in + * progress. + */ +void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) __acquires(css_set_lock) { /* @@ -3236,11 +3258,20 @@ void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) read_lock(&css_set_lock); it->cset_link = &cgrp->cset_links; - cgroup_advance_iter(cgrp, it); + cgroup_advance_task_iter(cgrp, it); } -struct task_struct *cgroup_iter_next(struct cgroup *cgrp, - struct cgroup_iter *it) +/** + * cgroup_task_iter_next - return the next task for the iterator + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator being iterated + * + * The "next" function for task iteration. @it should have been + * initialized via cgroup_task_iter_start(). Returns NULL when the + * iteration reaches the end. + */ +struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, + struct cgroup_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3254,16 +3285,25 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp, l = l->next; link = list_entry(it->cset_link, struct cgrp_cset_link, cset_link); if (l == &link->cset->tasks) { - /* We reached the end of this task list - move on to - * the next cg_cgroup_link */ - cgroup_advance_iter(cgrp, it); + /* + * We reached the end of this task list - move on to the + * next cgrp_cset_link. + */ + cgroup_advance_task_iter(cgrp, it); } else { it->task = l; } return res; } -void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) +/** + * cgroup_task_iter_end - finish task iteration + * @cgrp: the cgroup to walk tasks of + * @it: the task iterator to finish + * + * Finish task iteration started by cgroup_task_iter_start(). + */ +void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3312,7 +3352,7 @@ static inline int started_after(void *p1, void *p2) * Iterate through all the tasks in a cgroup, calling test_task() for each, * and if it returns true, call process_task() for it also. * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_iter_{start,next,end}() + * Effectively duplicates cgroup_task_iter_{start,next,end}() * but does not lock css_set_lock for the call to process_task(). * The struct cgroup_scanner may be embedded in any structure of the caller's * creation. @@ -3333,7 +3373,7 @@ static inline int started_after(void *p1, void *p2) int cgroup_scan_tasks(struct cgroup_scanner *scan) { int retval, i; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; @@ -3368,8 +3408,8 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_iter_start(scan->cgrp, &it); - while ((p = cgroup_iter_next(scan->cgrp, &it))) { + cgroup_task_iter_start(scan->cgrp, &it); + while ((p = cgroup_task_iter_next(scan->cgrp, &it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3402,7 +3442,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_iter_end(scan->cgrp, &it); + cgroup_task_iter_end(scan->cgrp, &it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3608,7 +3648,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3623,8 +3663,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + cgroup_task_iter_start(cgrp, &it); + while ((tsk = cgroup_task_iter_next(cgrp, &it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3635,7 +3675,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_iter_end(cgrp, &it); + cgroup_task_iter_end(cgrp, &it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3669,7 +3709,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *tsk; /* @@ -3683,8 +3723,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_iter_start(cgrp, &it); - while ((tsk = cgroup_iter_next(cgrp, &it))) { + cgroup_task_iter_start(cgrp, &it); + while ((tsk = cgroup_task_iter_next(cgrp, &it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3704,7 +3744,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_iter_end(cgrp, &it); + cgroup_task_iter_end(cgrp, &it); err: return ret; @@ -5137,7 +5177,7 @@ void cgroup_fork(struct task_struct *child) * Adds the task to the list running through its css_set if necessary and * call the subsystem fork() callbacks. Has to be after the task is * visible on the task list in case we race with the first call to - * cgroup_iter_start() - to guarantee that the new task ends up on its + * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ void cgroup_post_fork(struct task_struct *child) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 98ca48d9ceb4..c9177f8fc661 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -258,7 +258,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -279,9 +279,9 @@ static void update_if_frozen(struct cgroup_subsys_state *css) } /* are all tasks frozen? */ - cgroup_iter_start(css->cgroup, &it); + cgroup_task_iter_start(css->cgroup, &it); - while ((task = cgroup_iter_next(css->cgroup, &it))) { + while ((task = cgroup_task_iter_next(css->cgroup, &it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_iter_end(css->cgroup, &it); + cgroup_task_iter_end(css->cgroup, &it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -323,25 +323,25 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, static void freeze_cgroup(struct freezer *freezer) { struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) freeze_task(task); - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } static void unfreeze_cgroup(struct freezer *freezer) { struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) __thaw_task(task); - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2285319e23a9..00b055dd1d3d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1800,11 +1800,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct cgroup *cgroup = iter->css.cgroup; - struct cgroup_iter it; + struct cgroup_task_iter it; struct task_struct *task; - cgroup_iter_start(cgroup, &it); - while ((task = cgroup_iter_next(cgroup, &it))) { + cgroup_task_iter_start(cgroup, &it); + while ((task = cgroup_task_iter_next(cgroup, &it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1817,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1834,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_iter_end(cgroup, &it); + cgroup_task_iter_end(cgroup, &it); } if (!chosen) -- cgit v1.3-8-gc7d7 From c59cd3d840b1b0a8f996cbbd9132128dcaabbeb9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cgroup_task_iter remember the cgroup being iterated Currently all cgroup_task_iter functions require @cgrp to be passed in, which is superflous and increases chance of usage error. Make cgroup_task_iter remember the cgroup being iterated and drop @cgrp argument from next and end functions. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- include/linux/cgroup.h | 6 +++--- kernel/cgroup.c | 32 +++++++++++++++----------------- kernel/cgroup_freezer.c | 12 ++++++------ mm/memcontrol.c | 6 +++--- 4 files changed, 27 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index ea439794bd9b..0287fccd0f54 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -891,14 +891,14 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, /* A cgroup_task_iter should be treated as an opaque object */ struct cgroup_task_iter { + struct cgroup *origin_cgrp; struct list_head *cset_link; struct list_head *task; }; void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it); -struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, - struct cgroup_task_iter *it); -void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it); +struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it); +void cgroup_task_iter_end(struct cgroup_task_iter *it); int cgroup_scan_tasks(struct cgroup_scanner *scan); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 15c93f9c9e57..abc62ea1303c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3205,13 +3205,11 @@ EXPORT_SYMBOL_GPL(css_next_descendant_post); /** * cgroup_advance_task_iter - advance a task itererator to the next css_set - * @cgrp: the cgroup to walk tasks of * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void cgroup_advance_task_iter(struct cgroup *cgrp, - struct cgroup_task_iter *it) +static void cgroup_advance_task_iter(struct cgroup_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3220,7 +3218,7 @@ static void cgroup_advance_task_iter(struct cgroup *cgrp, /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &cgrp->cset_links) { + if (l == &it->origin_cgrp->cset_links) { it->cset_link = NULL; return; } @@ -3257,21 +3255,22 @@ void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); + + it->origin_cgrp = cgrp; it->cset_link = &cgrp->cset_links; - cgroup_advance_task_iter(cgrp, it); + + cgroup_advance_task_iter(it); } /** * cgroup_task_iter_next - return the next task for the iterator - * @cgrp: the cgroup to walk tasks of * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been * initialized via cgroup_task_iter_start(). Returns NULL when the * iteration reaches the end. */ -struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, - struct cgroup_task_iter *it) +struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3289,7 +3288,7 @@ struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, * We reached the end of this task list - move on to the * next cgrp_cset_link. */ - cgroup_advance_task_iter(cgrp, it); + cgroup_advance_task_iter(it); } else { it->task = l; } @@ -3298,12 +3297,11 @@ struct task_struct *cgroup_task_iter_next(struct cgroup *cgrp, /** * cgroup_task_iter_end - finish task iteration - * @cgrp: the cgroup to walk tasks of * @it: the task iterator to finish * * Finish task iteration started by cgroup_task_iter_start(). */ -void cgroup_task_iter_end(struct cgroup *cgrp, struct cgroup_task_iter *it) +void cgroup_task_iter_end(struct cgroup_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3409,7 +3407,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) */ heap->size = 0; cgroup_task_iter_start(scan->cgrp, &it); - while ((p = cgroup_task_iter_next(scan->cgrp, &it))) { + while ((p = cgroup_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3442,7 +3440,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) * the heap and wasn't inserted */ } - cgroup_task_iter_end(scan->cgrp, &it); + cgroup_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3664,7 +3662,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, return -ENOMEM; /* now, populate the array */ cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(cgrp, &it))) { + while ((tsk = cgroup_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3675,7 +3673,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_task_iter_end(cgrp, &it); + cgroup_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3724,7 +3722,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) cgrp = dentry->d_fsdata; cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(cgrp, &it))) { + while ((tsk = cgroup_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3744,7 +3742,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_task_iter_end(cgrp, &it); + cgroup_task_iter_end(&it); err: return ret; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index c9177f8fc661..e0ab9bfd679a 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -281,7 +281,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) /* are all tasks frozen? */ cgroup_task_iter_start(css->cgroup, &it); - while ((task = cgroup_task_iter_next(css->cgroup, &it))) { + while ((task = cgroup_task_iter_next(&it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_task_iter_end(css->cgroup, &it); + cgroup_task_iter_end(&it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -327,9 +327,9 @@ static void freeze_cgroup(struct freezer *freezer) struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) + while ((task = cgroup_task_iter_next(&it))) freeze_task(task); - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) @@ -339,9 +339,9 @@ static void unfreeze_cgroup(struct freezer *freezer) struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) + while ((task = cgroup_task_iter_next(&it))) __thaw_task(task); - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } /** diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00b055dd1d3d..5a5f4dc649f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1804,7 +1804,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, struct task_struct *task; cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(cgroup, &it))) { + while ((task = cgroup_task_iter_next(&it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1817,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1834,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_task_iter_end(cgroup, &it); + cgroup_task_iter_end(&it); } if (!chosen) -- cgit v1.3-8-gc7d7 From e535837b1dae17b5a2d76ea1bc22ac1a79354624 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: remove struct cgroup_scanner cgroup_scan_tasks() takes a pointer to struct cgroup_scanner as its sole argument and the only function of that struct is packing the arguments of the function call which are consisted of five fields. It's not too unusual to pack parameters into a struct when the number of arguments gets excessive or the whole set needs to be passed around a lot, but neither holds here making it just weird. Drop struct cgroup_scanner and pass the params directly to cgroup_scan_tasks(). Note that struct cpuset_change_nodemask_arg was added to cpuset.c to pass both ->cs and ->newmems pointer to cpuset_change_nodemask() using single data pointer. This doesn't make any functional differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 16 ++++----- kernel/cgroup.c | 93 +++++++++++++++++++++++--------------------------- kernel/cpuset.c | 63 ++++++++++++++-------------------- 3 files changed, 75 insertions(+), 97 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 0287fccd0f54..8472ed576b64 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -528,15 +528,6 @@ struct cftype_set { struct cftype *cfts; }; -struct cgroup_scanner { - struct cgroup *cgrp; - int (*test_task)(struct task_struct *p, struct cgroup_scanner *scan); - void (*process_task)(struct task_struct *p, - struct cgroup_scanner *scan); - struct ptr_heap *heap; - void *data; -}; - /* * See the comment above CGRP_ROOT_SANE_BEHAVIOR for details. This * function can be called as long as @cgrp is accessible. @@ -899,7 +890,12 @@ struct cgroup_task_iter { void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it); struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it); void cgroup_task_iter_end(struct cgroup_task_iter *it); -int cgroup_scan_tasks(struct cgroup_scanner *scan); + +int cgroup_scan_tasks(struct cgroup *cgrp, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap); + int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index abc62ea1303c..7b16ddb2569b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3343,32 +3343,37 @@ static inline int started_after(void *p1, void *p2) /** * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @scan: struct cgroup_scanner containing arguments for the scan + * @cgrp: the cgroup to iterate tasks of + * @test: optional test callback + * @process: process callback + * @data: data passed to @test and @process + * @heap: optional pre-allocated heap used for task iteration * - * Arguments include pointers to callback functions test_task() and - * process_task(). - * Iterate through all the tasks in a cgroup, calling test_task() for each, - * and if it returns true, call process_task() for it also. - * The test_task pointer may be NULL, meaning always true (select all tasks). - * Effectively duplicates cgroup_task_iter_{start,next,end}() - * but does not lock css_set_lock for the call to process_task(). - * The struct cgroup_scanner may be embedded in any structure of the caller's - * creation. - * It is guaranteed that process_task() will act on every task that - * is a member of the cgroup for the duration of this call. This - * function may or may not call process_task() for tasks that exit - * or move to a different cgroup during the call, or are forked or - * move into the cgroup during the call. + * Iterate through all the tasks in a cgroup, calling @test for each, and + * if it returns %true, call @process for it also. * - * Note that test_task() may be called with locks held, and may in some - * situations be called multiple times for the same task, so it should - * be cheap. - * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been - * pre-allocated and will be used for heap operations (and its "gt" member will - * be overwritten), else a temporary heap will be used (allocation of which - * may cause this function to fail). + * @test may be NULL, meaning always true (select all tasks), which + * effectively duplicates cgroup_task_iter_{start,next,end}() but does not + * lock css_set_lock for the call to @process. + * + * It is guaranteed that @process will act on every task that is a member + * of @cgrp for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different cgroup during + * the call, or are forked or move into the cgroup during the call. + * + * Note that @test may be called with locks held, and may in some + * situations be called multiple times for the same task, so it should be + * cheap. + * + * If @heap is non-NULL, a heap has been pre-allocated and will be used for + * heap operations (and its "gt" member will be overwritten), else a + * temporary heap will be used (allocation of which may cause this function + * to fail). */ -int cgroup_scan_tasks(struct cgroup_scanner *scan) +int cgroup_scan_tasks(struct cgroup *cgrp, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; struct cgroup_task_iter it; @@ -3376,12 +3381,10 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; struct ptr_heap tmp_heap; - struct ptr_heap *heap; struct timespec latest_time = { 0, 0 }; - if (scan->heap) { + if (heap) { /* The caller supplied our heap and pre-allocated its memory */ - heap = scan->heap; heap->gt = &started_after; } else { /* We need to allocate our own heap memory */ @@ -3394,25 +3397,24 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) again: /* - * Scan tasks in the cgroup, using the scanner's "test_task" callback - * to determine which are of interest, and using the scanner's - * "process_task" callback to process any of them that need an update. - * Since we don't want to hold any locks during the task updates, - * gather tasks to be processed in a heap structure. - * The heap is sorted by descending task start time. - * If the statically-sized heap fills up, we overflow tasks that - * started later, and in future iterations only consider tasks that - * started after the latest task in the previous pass. This + * Scan tasks in the cgroup, using the @test callback to determine + * which are of interest, and invoking @process callback on the + * ones which need an update. Since we don't want to hold any + * locks during the task updates, gather tasks to be processed in a + * heap structure. The heap is sorted by descending task start + * time. If the statically-sized heap fills up, we overflow tasks + * that started later, and in future iterations only consider tasks + * that started after the latest task in the previous pass. This * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_task_iter_start(scan->cgrp, &it); + cgroup_task_iter_start(cgrp, &it); while ((p = cgroup_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one */ - if (scan->test_task && !scan->test_task(p, scan)) + if (test && !test(p, data)) continue; /* * Only process tasks that started after the last task @@ -3450,7 +3452,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) latest_task = q; } /* Process the task per the caller's callback */ - scan->process_task(q, scan); + process(q, data); put_task_struct(q); } /* @@ -3467,10 +3469,9 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan) return 0; } -static void cgroup_transfer_one_task(struct task_struct *task, - struct cgroup_scanner *scan) +static void cgroup_transfer_one_task(struct task_struct *task, void *data) { - struct cgroup *new_cgroup = scan->data; + struct cgroup *new_cgroup = data; mutex_lock(&cgroup_mutex); cgroup_attach_task(new_cgroup, task, false); @@ -3484,15 +3485,7 @@ static void cgroup_transfer_one_task(struct task_struct *task, */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - struct cgroup_scanner scan; - - scan.cgrp = from; - scan.test_task = NULL; /* select all tasks in cgroup */ - scan.process_task = cgroup_transfer_one_task; - scan.heap = NULL; - scan.data = to; - - return cgroup_scan_tasks(&scan); + return cgroup_scan_tasks(from, NULL, cgroup_transfer_one_task, to, NULL); } /* diff --git a/kernel/cpuset.c b/kernel/cpuset.c index be4f5036ea5e..6fe23f2ac742 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -830,7 +830,7 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) /** * cpuset_change_cpumask - make a task's cpus_allowed the same as its cpuset's * @tsk: task to test - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * * Called by cgroup_scan_tasks() for each task in a cgroup whose * cpus_allowed mask needs to be changed. @@ -838,12 +838,11 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_cpumask(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_cpumask(struct task_struct *tsk, void *data) { - struct cpuset *cpus_cs; + struct cpuset *cs = data; + struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); - cpus_cs = effective_cpumask_cpuset(cgroup_cs(scan->cgrp)); set_cpus_allowed_ptr(tsk, cpus_cs->cpus_allowed); } @@ -862,13 +861,8 @@ static void cpuset_change_cpumask(struct task_struct *tsk, */ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_cpumask; - scan.heap = heap; - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_cpumask, cs, + heap); } /* @@ -1052,20 +1046,24 @@ static void cpuset_change_task_nodemask(struct task_struct *tsk, task_unlock(tsk); } +struct cpuset_change_nodemask_arg { + struct cpuset *cs; + nodemask_t *newmems; +}; + /* * Update task's mems_allowed and rebind its mempolicy and vmas' mempolicy * of it to cpuset's new mems_allowed, and migrate pages to new nodes if * memory_migrate flag is set. Called with cpuset_mutex held. */ -static void cpuset_change_nodemask(struct task_struct *p, - struct cgroup_scanner *scan) +static void cpuset_change_nodemask(struct task_struct *p, void *data) { - struct cpuset *cs = cgroup_cs(scan->cgrp); + struct cpuset_change_nodemask_arg *arg = data; + struct cpuset *cs = arg->cs; struct mm_struct *mm; int migrate; - nodemask_t *newmems = scan->data; - cpuset_change_task_nodemask(p, newmems); + cpuset_change_task_nodemask(p, arg->newmems); mm = get_task_mm(p); if (!mm) @@ -1075,7 +1073,7 @@ static void cpuset_change_nodemask(struct task_struct *p, mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) - cpuset_migrate_mm(mm, &cs->old_mems_allowed, newmems); + cpuset_migrate_mm(mm, &cs->old_mems_allowed, arg->newmems); mmput(mm); } @@ -1093,19 +1091,14 @@ static void *cpuset_being_rebound; static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { static nodemask_t newmems; /* protected by cpuset_mutex */ - struct cgroup_scanner scan; struct cpuset *mems_cs = effective_nodemask_cpuset(cs); + struct cpuset_change_nodemask_arg arg = { .cs = cs, + .newmems = &newmems }; cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ guarantee_online_mems(mems_cs, &newmems); - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_nodemask; - scan.heap = heap; - scan.data = &newmems; - /* * The mpol_rebind_mm() call takes mmap_sem, which we couldn't * take while holding tasklist_lock. Forks can happen - the @@ -1116,7 +1109,8 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_nodemask, &arg, + heap); /* * All the tasks' nodemasks have been updated, update @@ -1263,17 +1257,18 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) /* * cpuset_change_flag - make a task's spread flags the same as its cpuset's * @tsk: task to be updated - * @scan: struct cgroup_scanner containing the cgroup of the task + * @data: cpuset to @tsk belongs to * * Called by cgroup_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. */ -static void cpuset_change_flag(struct task_struct *tsk, - struct cgroup_scanner *scan) +static void cpuset_change_flag(struct task_struct *tsk, void *data) { - cpuset_update_task_spread_flag(cgroup_cs(scan->cgrp), tsk); + struct cpuset *cs = data; + + cpuset_update_task_spread_flag(cs, tsk); } /* @@ -1291,13 +1286,7 @@ static void cpuset_change_flag(struct task_struct *tsk, */ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { - struct cgroup_scanner scan; - - scan.cgrp = cs->css.cgroup; - scan.test_task = NULL; - scan.process_task = cpuset_change_flag; - scan.heap = heap; - cgroup_scan_tasks(&scan); + cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_flag, cs, heap); } /* -- cgit v1.3-8-gc7d7 From 72ec7029937f0518eff21b8762743c31591684f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make task iterators deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. This patch converts task iterators to deal with css instead of cgroup. Note that under unified hierarchy, different sets of tasks will be considered belonging to a given cgroup depending on the subsystem in question and making the iterators deal with css instead cgroup provides them with enough information about the iteration. While at it, fix several function comment formats in cpuset.c. This patch doesn't introduce any behavior differences. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh Cc: Matt Helsley --- include/linux/cgroup.h | 21 ++++----- kernel/cgroup.c | 112 ++++++++++++++++++++++++------------------------ kernel/cgroup_freezer.c | 26 ++++++----- kernel/cpuset.c | 41 ++++++++---------- mm/memcontrol.c | 11 +++-- 5 files changed, 104 insertions(+), 107 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8472ed576b64..cd105fce089c 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -880,21 +880,22 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) -/* A cgroup_task_iter should be treated as an opaque object */ -struct cgroup_task_iter { - struct cgroup *origin_cgrp; +/* A css_task_iter should be treated as an opaque object */ +struct css_task_iter { + struct cgroup_subsys_state *origin_css; struct list_head *cset_link; struct list_head *task; }; -void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it); -struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it); -void cgroup_task_iter_end(struct cgroup_task_iter *it); +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it); +struct task_struct *css_task_iter_next(struct css_task_iter *it); +void css_task_iter_end(struct css_task_iter *it); -int cgroup_scan_tasks(struct cgroup *cgrp, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap); +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b16ddb2569b..8c57301d0561 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -370,7 +370,7 @@ static int cgroup_init_idr(struct cgroup_subsys *ss, /* * css_set_lock protects the list of css_set objects, and the chain of * tasks off each css_set. Nests outside task->alloc_lock due to - * cgroup_task_iter_start(). + * css_task_iter_start(). */ static DEFINE_RWLOCK(css_set_lock); static int css_set_count; @@ -398,9 +398,9 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) /* * We don't maintain the lists running through each css_set to its task - * until after the first call to cgroup_task_iter_start(). This reduces - * the fork()/exit() overhead for people who have cgroups compiled into - * their kernel but not actually in use. + * until after the first call to css_task_iter_start(). This reduces the + * fork()/exit() overhead for people who have cgroups compiled into their + * kernel but not actually in use. */ static int use_task_css_set_links __read_mostly; @@ -2989,7 +2989,7 @@ int cgroup_task_count(const struct cgroup *cgrp) * To reduce the fork() overhead for systems that are not actually using * their cgroups capability, we don't maintain the lists running through * each css_set to its tasks until we see the list actually used - in other - * words after the first call to cgroup_task_iter_start(). + * words after the first call to css_task_iter_start(). */ static void cgroup_enable_task_cg_lists(void) { @@ -3204,12 +3204,12 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, EXPORT_SYMBOL_GPL(css_next_descendant_post); /** - * cgroup_advance_task_iter - advance a task itererator to the next css_set + * css_advance_task_iter - advance a task itererator to the next css_set * @it: the iterator to advance * * Advance @it to the next css_set to walk. */ -static void cgroup_advance_task_iter(struct cgroup_task_iter *it) +static void css_advance_task_iter(struct css_task_iter *it) { struct list_head *l = it->cset_link; struct cgrp_cset_link *link; @@ -3218,7 +3218,7 @@ static void cgroup_advance_task_iter(struct cgroup_task_iter *it) /* Advance to the next non-empty css_set */ do { l = l->next; - if (l == &it->origin_cgrp->cset_links) { + if (l == &it->origin_css->cgroup->cset_links) { it->cset_link = NULL; return; } @@ -3230,47 +3230,48 @@ static void cgroup_advance_task_iter(struct cgroup_task_iter *it) } /** - * cgroup_task_iter_start - initiate task iteration - * @cgrp: the cgroup to walk tasks of + * css_task_iter_start - initiate task iteration + * @css: the css to walk tasks of * @it: the task iterator to use * - * Initiate iteration through the tasks of @cgrp. The caller can call - * cgroup_task_iter_next() to walk through the tasks until the function - * returns NULL. On completion of iteration, cgroup_task_iter_end() must - * be called. + * Initiate iteration through the tasks of @css. The caller can call + * css_task_iter_next() to walk through the tasks until the function + * returns NULL. On completion of iteration, css_task_iter_end() must be + * called. * * Note that this function acquires a lock which is released when the * iteration finishes. The caller can't sleep while iteration is in * progress. */ -void cgroup_task_iter_start(struct cgroup *cgrp, struct cgroup_task_iter *it) +void css_task_iter_start(struct cgroup_subsys_state *css, + struct css_task_iter *it) __acquires(css_set_lock) { /* - * The first time anyone tries to iterate across a cgroup, - * we need to enable the list linking each css_set to its - * tasks, and fix up all existing tasks. + * The first time anyone tries to iterate across a css, we need to + * enable the list linking each css_set to its tasks, and fix up + * all existing tasks. */ if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); read_lock(&css_set_lock); - it->origin_cgrp = cgrp; - it->cset_link = &cgrp->cset_links; + it->origin_css = css; + it->cset_link = &css->cgroup->cset_links; - cgroup_advance_task_iter(it); + css_advance_task_iter(it); } /** - * cgroup_task_iter_next - return the next task for the iterator + * css_task_iter_next - return the next task for the iterator * @it: the task iterator being iterated * * The "next" function for task iteration. @it should have been - * initialized via cgroup_task_iter_start(). Returns NULL when the - * iteration reaches the end. + * initialized via css_task_iter_start(). Returns NULL when the iteration + * reaches the end. */ -struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) +struct task_struct *css_task_iter_next(struct css_task_iter *it) { struct task_struct *res; struct list_head *l = it->task; @@ -3288,7 +3289,7 @@ struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) * We reached the end of this task list - move on to the * next cgrp_cset_link. */ - cgroup_advance_task_iter(it); + css_advance_task_iter(it); } else { it->task = l; } @@ -3296,12 +3297,12 @@ struct task_struct *cgroup_task_iter_next(struct cgroup_task_iter *it) } /** - * cgroup_task_iter_end - finish task iteration + * css_task_iter_end - finish task iteration * @it: the task iterator to finish * - * Finish task iteration started by cgroup_task_iter_start(). + * Finish task iteration started by css_task_iter_start(). */ -void cgroup_task_iter_end(struct cgroup_task_iter *it) +void css_task_iter_end(struct css_task_iter *it) __releases(css_set_lock) { read_unlock(&css_set_lock); @@ -3342,24 +3343,24 @@ static inline int started_after(void *p1, void *p2) } /** - * cgroup_scan_tasks - iterate though all the tasks in a cgroup - * @cgrp: the cgroup to iterate tasks of + * css_scan_tasks - iterate though all the tasks in a css + * @css: the css to iterate tasks of * @test: optional test callback * @process: process callback * @data: data passed to @test and @process * @heap: optional pre-allocated heap used for task iteration * - * Iterate through all the tasks in a cgroup, calling @test for each, and - * if it returns %true, call @process for it also. + * Iterate through all the tasks in @css, calling @test for each, and if it + * returns %true, call @process for it also. * * @test may be NULL, meaning always true (select all tasks), which - * effectively duplicates cgroup_task_iter_{start,next,end}() but does not + * effectively duplicates css_task_iter_{start,next,end}() but does not * lock css_set_lock for the call to @process. * * It is guaranteed that @process will act on every task that is a member - * of @cgrp for the duration of this call. This function may or may not - * call @process for tasks that exit or move to a different cgroup during - * the call, or are forked or move into the cgroup during the call. + * of @css for the duration of this call. This function may or may not + * call @process for tasks that exit or move to a different css during the + * call, or are forked or move into the css during the call. * * Note that @test may be called with locks held, and may in some * situations be called multiple times for the same task, so it should be @@ -3370,13 +3371,13 @@ static inline int started_after(void *p1, void *p2) * temporary heap will be used (allocation of which may cause this function * to fail). */ -int cgroup_scan_tasks(struct cgroup *cgrp, - bool (*test)(struct task_struct *, void *), - void (*process)(struct task_struct *, void *), - void *data, struct ptr_heap *heap) +int css_scan_tasks(struct cgroup_subsys_state *css, + bool (*test)(struct task_struct *, void *), + void (*process)(struct task_struct *, void *), + void *data, struct ptr_heap *heap) { int retval, i; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *p, *dropped; /* Never dereference latest_task, since it's not refcounted */ struct task_struct *latest_task = NULL; @@ -3397,7 +3398,7 @@ int cgroup_scan_tasks(struct cgroup *cgrp, again: /* - * Scan tasks in the cgroup, using the @test callback to determine + * Scan tasks in the css, using the @test callback to determine * which are of interest, and invoking @process callback on the * ones which need an update. Since we don't want to hold any * locks during the task updates, gather tasks to be processed in a @@ -3408,8 +3409,8 @@ int cgroup_scan_tasks(struct cgroup *cgrp, * guarantees forward progress and that we don't miss any tasks. */ heap->size = 0; - cgroup_task_iter_start(cgrp, &it); - while ((p = cgroup_task_iter_next(&it))) { + css_task_iter_start(css, &it); + while ((p = css_task_iter_next(&it))) { /* * Only affect tasks that qualify per the caller's callback, * if he provided one @@ -3442,7 +3443,7 @@ int cgroup_scan_tasks(struct cgroup *cgrp, * the heap and wasn't inserted */ } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); if (heap->size) { for (i = 0; i < heap->size; i++) { @@ -3485,7 +3486,8 @@ static void cgroup_transfer_one_task(struct task_struct *task, void *data) */ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) { - return cgroup_scan_tasks(from, NULL, cgroup_transfer_one_task, to, NULL); + return css_scan_tasks(&from->dummy_css, NULL, cgroup_transfer_one_task, + to, NULL); } /* @@ -3639,7 +3641,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, pid_t *array; int length; int pid, n = 0; /* used for populating the array */ - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *tsk; struct cgroup_pidlist *l; @@ -3654,8 +3656,8 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (!array) return -ENOMEM; /* now, populate the array */ - cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(&it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { if (unlikely(n == length)) break; /* get tgid or pid for procs or tasks file respectively */ @@ -3666,7 +3668,7 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, if (pid > 0) /* make sure to only use valid results */ array[n++] = pid; } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ sort(array, length, sizeof(pid_t), cmppid, NULL); @@ -3700,7 +3702,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { int ret = -EINVAL; struct cgroup *cgrp; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *tsk; /* @@ -3714,8 +3716,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) ret = 0; cgrp = dentry->d_fsdata; - cgroup_task_iter_start(cgrp, &it); - while ((tsk = cgroup_task_iter_next(&it))) { + css_task_iter_start(&cgrp->dummy_css, &it); + while ((tsk = css_task_iter_next(&it))) { switch (tsk->state) { case TASK_RUNNING: stats->nr_running++; @@ -3735,7 +3737,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) break; } } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); err: return ret; diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index e0ab9bfd679a..5cd2b6d55243 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -258,7 +258,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) { struct freezer *freezer = css_freezer(css); struct cgroup_subsys_state *pos; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; WARN_ON_ONCE(!rcu_read_lock_held()); @@ -279,9 +279,9 @@ static void update_if_frozen(struct cgroup_subsys_state *css) } /* are all tasks frozen? */ - cgroup_task_iter_start(css->cgroup, &it); + css_task_iter_start(css, &it); - while ((task = cgroup_task_iter_next(&it))) { + while ((task = css_task_iter_next(&it))) { if (freezing(task)) { /* * freezer_should_skip() indicates that the task @@ -296,7 +296,7 @@ static void update_if_frozen(struct cgroup_subsys_state *css) freezer->state |= CGROUP_FROZEN; out_iter_end: - cgroup_task_iter_end(&it); + css_task_iter_end(&it); out_unlock: spin_unlock_irq(&freezer->lock); } @@ -322,26 +322,24 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, static void freeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) freeze_task(task); - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } static void unfreeze_cgroup(struct freezer *freezer) { - struct cgroup *cgroup = freezer->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) + css_task_iter_start(&freezer->css, &it); + while ((task = css_task_iter_next(&it))) __thaw_task(task); - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } /** diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6fe23f2ac742..39e52175f4af 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -832,8 +832,8 @@ static struct cpuset *effective_nodemask_cpuset(struct cpuset *cs) * @tsk: task to test * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup whose - * cpus_allowed mask needs to be changed. + * Called by css_scan_tasks() for each task in a cgroup whose cpus_allowed + * mask needs to be changed. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. @@ -849,27 +849,26 @@ static void cpuset_change_cpumask(struct task_struct *tsk, void *data) /** * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. */ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) { - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_cpumask, cs, - heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_cpumask, cs, heap); } /* * update_tasks_cpumask_hier - Update the cpumasks of tasks in the hierarchy. * @root_cs: the root cpuset of the hierarchy * @update_root: update root cpuset or not? - * @heap: the heap used by cgroup_scan_tasks() + * @heap: the heap used by css_scan_tasks() * * This will update cpumasks of tasks in @root_cs and all other empty cpusets * which take on cpumask of @root_cs. @@ -1082,11 +1081,10 @@ static void *cpuset_being_rebound; /** * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. * @cs: the cpuset in which each task's mems_allowed mask needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * - * Called with cpuset_mutex held - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 - * if @heap != NULL. + * Called with cpuset_mutex held. No return value. It's guaranteed that + * css_scan_tasks() always returns 0 if @heap != NULL. */ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) { @@ -1109,8 +1107,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * It's ok if we rebind the same mm twice; mpol_rebind_mm() * is idempotent. Also migrate pages in each mm to new nodes. */ - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_nodemask, &arg, - heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_nodemask, &arg, heap); /* * All the tasks' nodemasks have been updated, update @@ -1126,7 +1123,7 @@ static void update_tasks_nodemask(struct cpuset *cs, struct ptr_heap *heap) * update_tasks_nodemask_hier - Update the nodemasks of tasks in the hierarchy. * @cs: the root cpuset of the hierarchy * @update_root: update the root cpuset or not? - * @heap: the heap used by cgroup_scan_tasks() + * @heap: the heap used by css_scan_tasks() * * This will update nodemasks of tasks in @root_cs and all other empty cpusets * which take on nodemask of @root_cs. @@ -1254,12 +1251,12 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) return 0; } -/* +/** * cpuset_change_flag - make a task's spread flags the same as its cpuset's * @tsk: task to be updated * @data: cpuset to @tsk belongs to * - * Called by cgroup_scan_tasks() for each task in a cgroup. + * Called by css_scan_tasks() for each task in a cgroup. * * We don't need to re-check for the cgroup/cpuset membership, since we're * holding cpuset_mutex at this point. @@ -1271,22 +1268,22 @@ static void cpuset_change_flag(struct task_struct *tsk, void *data) cpuset_update_task_spread_flag(cs, tsk); } -/* +/** * update_tasks_flags - update the spread flags of tasks in the cpuset. * @cs: the cpuset in which each task's spread flags needs to be changed - * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() + * @heap: if NULL, defer allocating heap memory to css_scan_tasks() * * Called with cpuset_mutex held * - * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, + * The css_scan_tasks() function will scan all the tasks in a cgroup, * calling callback functions for each. * - * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 + * No return value. It's guaranteed that css_scan_tasks() always returns 0 * if @heap != NULL. */ static void update_tasks_flags(struct cpuset *cs, struct ptr_heap *heap) { - cgroup_scan_tasks(cs->css.cgroup, NULL, cpuset_change_flag, cs, heap); + css_scan_tasks(&cs->css, NULL, cpuset_change_flag, cs, heap); } /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5a5f4dc649f0..95106a993777 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1799,12 +1799,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; for_each_mem_cgroup_tree(iter, memcg) { - struct cgroup *cgroup = iter->css.cgroup; - struct cgroup_task_iter it; + struct css_task_iter it; struct task_struct *task; - cgroup_task_iter_start(cgroup, &it); - while ((task = cgroup_task_iter_next(&it))) { + css_task_iter_start(&iter->css, &it); + while ((task = css_task_iter_next(&it))) { switch (oom_scan_process_thread(task, totalpages, NULL, false)) { case OOM_SCAN_SELECT: @@ -1817,7 +1816,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: - cgroup_task_iter_end(&it); + css_task_iter_end(&it); mem_cgroup_iter_break(memcg, iter); if (chosen) put_task_struct(chosen); @@ -1834,7 +1833,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, get_task_struct(chosen); } } - cgroup_task_iter_end(&it); + css_task_iter_end(&it); } if (!chosen) -- cgit v1.3-8-gc7d7 From 81eeaf0411204f52af8ef78ff107cfca2fcfec1d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:26 -0400 Subject: cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cftype->[un]register_event() is among the remaining couple interfaces which still use struct cgroup. Convert it to cgroup_subsys_state. The conversion is mostly mechanical and removes the last users of mem_cgroup_from_cont() and cg_to_vmpressure(), which are removed. v2: indentation update as suggested by Li Zefan. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Balbir Singh --- include/linux/cgroup.h | 10 ++++++---- include/linux/vmpressure.h | 6 ++++-- kernel/cgroup.c | 15 ++++++++------- mm/memcontrol.c | 21 ++++++++------------- mm/vmpressure.c | 21 +++++++++------------ 5 files changed, 35 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index cd105fce089c..b065d24486e6 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -506,16 +506,18 @@ struct cftype { * you want to provide this functionality. Use eventfd_signal() * on eventfd to send notification to userspace. */ - int (*register_event)(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args); + int (*register_event)(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args); /* * unregister_event() callback will be called when userspace * closes the eventfd or on cgroup removing. * This callback must be implemented, if you want provide * notification functionality. */ - void (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, - struct eventfd_ctx *eventfd); + void (*unregister_event)(struct cgroup_subsys_state *css, + struct cftype *cft, + struct eventfd_ctx *eventfd); }; /* diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h index 76be077340ea..b239482bd39d 100644 --- a/include/linux/vmpressure.h +++ b/include/linux/vmpressure.h @@ -33,10 +33,12 @@ extern void vmpressure_init(struct vmpressure *vmpr); extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg); extern struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr); extern struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css); -extern int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, +extern int vmpressure_register_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args); -extern void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, +extern void vmpressure_unregister_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd); #else static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8c57301d0561..a71f2e0f9711 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -159,9 +159,9 @@ struct css_id { */ struct cgroup_event { /* - * Cgroup which the event belongs to. + * css which the event belongs to. */ - struct cgroup *cgrp; + struct cgroup_subsys_state *css; /* * Control file which the event associated. */ @@ -3955,11 +3955,12 @@ static void cgroup_event_remove(struct work_struct *work) { struct cgroup_event *event = container_of(work, struct cgroup_event, remove); - struct cgroup *cgrp = event->cgrp; + struct cgroup_subsys_state *css = event->css; + struct cgroup *cgrp = css->cgroup; remove_wait_queue(event->wqh, &event->wait); - event->cft->unregister_event(cgrp, event->cft, event->eventfd); + event->cft->unregister_event(css, event->cft, event->eventfd); /* Notify userspace the event is going away. */ eventfd_signal(event->eventfd, 1); @@ -3979,7 +3980,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, { struct cgroup_event *event = container_of(wait, struct cgroup_event, wait); - struct cgroup *cgrp = event->cgrp; + struct cgroup *cgrp = event->css->cgroup; unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { @@ -4048,7 +4049,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->cgrp = cgrp; + event->css = css; INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -4099,7 +4100,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->cft->register_event(cgrp, event->cft, + ret = event->cft->register_event(css, event->cft, event->eventfd, buffer); if (ret) goto out_put_cfile; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 95106a993777..2885e3e85047 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1034,11 +1034,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) preempt_enable(); } -static inline struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) -{ - return mem_cgroup_from_css(cgroup_css(cont, mem_cgroup_subsys_id)); -} - struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) { /* @@ -5620,10 +5615,10 @@ static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) mem_cgroup_oom_notify_cb(iter); } -static int mem_cgroup_usage_register_event(struct cgroup *cgrp, +static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5703,10 +5698,10 @@ unlock: return ret; } -static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_thresholds *thresholds; struct mem_cgroup_threshold_ary *new; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5782,10 +5777,10 @@ unlock: mutex_unlock(&memcg->thresholds_lock); } -static int mem_cgroup_oom_register_event(struct cgroup *cgrp, +static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *event; enum res_type type = MEMFILE_TYPE(cft->private); @@ -5807,10 +5802,10 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, return 0; } -static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, +static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_eventfd_list *ev, *tmp; enum res_type type = MEMFILE_TYPE(cft->private); diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 2a8a736e95cc..13489b1f4ee3 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -74,11 +74,6 @@ static struct vmpressure *work_to_vmpressure(struct work_struct *work) return container_of(work, struct vmpressure, work); } -static struct vmpressure *cg_to_vmpressure(struct cgroup *cg) -{ - return css_to_vmpressure(cgroup_css(cg, mem_cgroup_subsys_id)); -} - static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr) { struct cgroup_subsys_state *css = vmpressure_to_css(vmpr); @@ -283,7 +278,7 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) /** * vmpressure_register_event() - Bind vmpressure notifications to an eventfd - * @cg: cgroup that is interested in vmpressure notifications + * @css: css that is interested in vmpressure notifications * @cft: cgroup control files handle * @eventfd: eventfd context to link notifications with * @args: event arguments (used to set up a pressure level threshold) @@ -298,10 +293,11 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) * cftype).register_event, and then cgroup core will handle everything by * itself. */ -int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, - struct eventfd_ctx *eventfd, const char *args) +int vmpressure_register_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd, + const char *args) { - struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; int level; @@ -329,7 +325,7 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, /** * vmpressure_unregister_event() - Unbind eventfd from vmpressure - * @cg: cgroup handle + * @css: css handle * @cft: cgroup control files handle * @eventfd: eventfd context that was used to link vmpressure with the @cg * @@ -341,10 +337,11 @@ int vmpressure_register_event(struct cgroup *cg, struct cftype *cft, * cftype).unregister_event, and then cgroup core will handle everything * by itself. */ -void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, +void vmpressure_unregister_event(struct cgroup_subsys_state *css, + struct cftype *cft, struct eventfd_ctx *eventfd) { - struct vmpressure *vmpr = cg_to_vmpressure(cg); + struct vmpressure *vmpr = css_to_vmpressure(css); struct vmpressure_event *ev; mutex_lock(&vmpr->events_lock); -- cgit v1.3-8-gc7d7 From d99c8727e7bbc01b70e2c57e6127bfab26b868fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make cgroup_taskset deal with cgroup_subsys_state instead of cgroup cgroup is in the process of converting to css (cgroup_subsys_state) from cgroup as the principal subsystem interface handle. This is mostly to prepare for the unified hierarchy support where css's will be created and destroyed dynamically but also helps cleaning up subsystem implementations as css is usually what they are interested in anyway. cgroup_taskset which is used by the subsystem attach methods is the last cgroup subsystem API which isn't using css as the handle. Update cgroup_taskset_cur_cgroup() to cgroup_taskset_cur_css() and cgroup_taskset_for_each() to take @skip_css instead of @skip_cgrp. The conversions are pretty mechanical. One exception is cpuset::cgroup_cs(), which lost its last user and got removed. This patch shouldn't introduce any functional changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Daniel Wagner Cc: Ingo Molnar Cc: Matt Helsley Cc: Steven Rostedt --- block/blk-cgroup.c | 2 +- include/linux/cgroup.h | 12 +++++++----- kernel/cgroup.c | 16 +++++++++------- kernel/cgroup_freezer.c | 2 +- kernel/cpuset.c | 15 +++++---------- kernel/events/core.c | 2 +- kernel/sched/core.c | 4 ++-- net/core/netprio_cgroup.c | 2 +- net/sched/cls_cgroup.c | 2 +- 9 files changed, 28 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4b40640240a4..54ad00292edf 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -891,7 +891,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css, int ret = 0; /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { task_lock(task); ioc = task->io_context; if (ioc && atomic_read(&ioc->nr_tasks) > 1) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b065d24486e6..d9a970568be9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -562,20 +562,22 @@ int cgroup_task_count(const struct cgroup *cgrp); struct cgroup_taskset; struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset); -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset); +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id); int cgroup_taskset_size(struct cgroup_taskset *tset); /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor - * @skip_cgrp: skip if task's cgroup matches this, %NULL to iterate through all + * @skip_css: skip if task's css matches this, %NULL to iterate through all * @tset: taskset to iterate */ -#define cgroup_taskset_for_each(task, skip_cgrp, tset) \ +#define cgroup_taskset_for_each(task, skip_css, tset) \ for ((task) = cgroup_taskset_first((tset)); (task); \ (task) = cgroup_taskset_next((tset))) \ - if (!(skip_cgrp) || \ - cgroup_taskset_cur_cgroup((tset)) != (skip_cgrp)) + if (!(skip_css) || \ + cgroup_taskset_cur_css((tset), \ + (skip_css)->ss->subsys_id) != (skip_css)) /* * Control Group subsystem type. diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a71f2e0f9711..e5bfb2a81dcb 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1907,18 +1907,20 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) EXPORT_SYMBOL_GPL(cgroup_taskset_next); /** - * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task + * cgroup_taskset_cur_css - return the matching css for the current task * @tset: taskset of interest + * @subsys_id: the ID of the target subsystem * - * Return the cgroup for the current (last returned) task of @tset. This - * function must be preceded by either cgroup_taskset_first() or - * cgroup_taskset_next(). + * Return the css for the current (last returned) task of @tset for + * subsystem specified by @subsys_id. This function must be preceded by + * either cgroup_taskset_first() or cgroup_taskset_next(). */ -struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) +struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, + int subsys_id) { - return tset->cur_cgrp; + return cgroup_css(tset->cur_cgrp, subsys_id); } -EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); +EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); /** * cgroup_taskset_size - return the number of tasks in taskset diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 5cd2b6d55243..224da9aa27f5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -189,7 +189,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, new_css->cgroup, tset) { + cgroup_taskset_for_each(task, new_css, tset) { if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 39e52175f4af..bf69717325b4 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -119,12 +119,6 @@ static inline struct cpuset *css_cs(struct cgroup_subsys_state *css) return css ? container_of(css, struct cpuset, css) : NULL; } -/* Retrieve the cpuset for a cgroup */ -static inline struct cpuset *cgroup_cs(struct cgroup *cgrp) -{ - return css_cs(cgroup_css(cgrp, cpuset_subsys_id)); -} - /* Retrieve the cpuset for a task */ static inline struct cpuset *task_cs(struct task_struct *task) { @@ -1459,7 +1453,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * Kthreads which disallow setaffinity shouldn't be moved * to a new cpuset; we don't want to change their cpu @@ -1511,9 +1505,10 @@ static void cpuset_attach(struct cgroup_subsys_state *css, struct mm_struct *mm; struct task_struct *task; struct task_struct *leader = cgroup_taskset_first(tset); - struct cgroup *oldcgrp = cgroup_taskset_cur_cgroup(tset); + struct cgroup_subsys_state *oldcss = cgroup_taskset_cur_css(tset, + cpuset_subsys_id); struct cpuset *cs = css_cs(css); - struct cpuset *oldcs = cgroup_cs(oldcgrp); + struct cpuset *oldcs = css_cs(oldcss); struct cpuset *cpus_cs = effective_cpumask_cpuset(cs); struct cpuset *mems_cs = effective_nodemask_cpuset(cs); @@ -1527,7 +1522,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(mems_cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here diff --git a/kernel/events/core.c b/kernel/events/core.c index 9705a0ed1dce..c199c4f24910 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7816,7 +7816,7 @@ static void perf_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cc9a49266382..a7122d5b8310 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7135,7 +7135,7 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -7153,7 +7153,7 @@ static void cpu_cgroup_attach(struct cgroup_subsys_state *css, { struct task_struct *task; - cgroup_taskset_for_each(task, css->cgroup, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index e00f60e5baea..d9cd627e6a16 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -224,7 +224,7 @@ static void net_prio_attach(struct cgroup_subsys_state *css, struct task_struct *p; void *v; - cgroup_taskset_for_each(p, css->cgroup, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); v = (void *)(unsigned long)task_netprioidx(p); iterate_fd(p->files, 0, update_netprio, v); diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index 8ea1184cec92..867b4a3e3980 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -74,7 +74,7 @@ static void cgrp_attach(struct cgroup_subsys_state *css, struct task_struct *p; void *v; - cgroup_taskset_for_each(p, css->cgroup, tset) { + cgroup_taskset_for_each(p, css, tset) { task_lock(p); v = (void *)(unsigned long)task_cls_classid(p); iterate_fd(p->files, 0, update_classid, v); -- cgit v1.3-8-gc7d7 From 95109b627ba6a043c181fa5fa45d1c754dd44fbc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: unexport cgroup_css() cgroup_css() no longer has any user left outside cgroup.c proper and we don't want subsystems to grow new usages of the function. cgroup core should always provide the css to use to the subsystems, which will make dynamic creation and destruction of css's across the lifetime of a cgroup much more manageable than exposing the cgroup directly to subsystems and let them dereference css's from it. Make cgroup_css() a static function in cgroup.c. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 13 ------------- kernel/cgroup.c | 13 +++++++++++++ 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d9a970568be9..c40e508d54e9 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -677,19 +677,6 @@ struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) return &parent_cgrp->dummy_css; } -/** - * cgroup_css - obtain a cgroup's css for the specified subsystem - * @cgrp: the cgroup of interest - * @subsys_id: the subsystem of interest - * - * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. - */ -static inline struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - int subsys_id) -{ - return cgrp->subsys[subsys_id]; -} - /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e5bfb2a81dcb..c02a288a4e3d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -222,6 +222,19 @@ static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); +/** + * cgroup_css - obtain a cgroup's css for the specified subsystem + * @cgrp: the cgroup of interest + * @subsys_id: the subsystem of interest + * + * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + */ +static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, + int subsys_id) +{ + return cgrp->subsys[subsys_id]; +} + /* convenient tests for these bits */ static inline bool cgroup_is_dead(const struct cgroup *cgrp) { -- cgit v1.3-8-gc7d7 From bd8815a6d802fc16a7a106e170593aa05dc17e72 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Aug 2013 20:11:27 -0400 Subject: cgroup: make css_for_each_descendant() and friends include the origin css in the iteration Previously, all css descendant iterators didn't include the origin (root of subtree) css in the iteration. The reasons were maintaining consistency with css_for_each_child() and that at the time of introduction more use cases needed skipping the origin anyway; however, given that css_is_descendant() considers self to be a descendant, omitting the origin css has become more confusing and looking at the accumulated use cases rather clearly indicates that including origin would result in simpler code overall. While this is a change which can easily lead to subtle bugs, cgroup API including the iterators has recently gone through major restructuring and no out-of-tree changes will be applicable without adjustments making this a relatively acceptable opportunity for this type of change. The conversions are mostly straight-forward. If the iteration block had explicit origin handling before or after, it's moved inside the iteration. If not, if (pos == origin) continue; is added. Some conversions add extra reference get/put around origin handling by consolidating origin handling and the rest. While the extra ref operations aren't strictly necessary, this shouldn't cause any noticeable difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Vivek Goyal Acked-by: Aristeu Rozanski Acked-by: Michal Hocko Cc: Jens Axboe Cc: Matt Helsley Cc: Johannes Weiner Cc: Balbir Singh --- block/blk-cgroup.c | 8 ++------ block/blk-cgroup.h | 4 +++- block/blk-throttle.c | 3 --- include/linux/cgroup.h | 17 +++++++++-------- kernel/cgroup.c | 29 +++++++++++------------------ kernel/cgroup_freezer.c | 29 ++++++++++++++++------------- kernel/cpuset.c | 42 ++++++++++++++++++++++++++---------------- mm/memcontrol.c | 9 +-------- security/device_cgroup.c | 2 +- 9 files changed, 69 insertions(+), 74 deletions(-) (limited to 'kernel') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 54ad00292edf..e90c7c164c83 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -615,12 +615,10 @@ u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off) struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; - u64 sum; + u64 sum = 0; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_stat_read((void *)pd + off); - rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); @@ -650,13 +648,11 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd, struct blkcg_policy *pol = blkcg_policy[pd->plid]; struct blkcg_gq *pos_blkg; struct cgroup_subsys_state *pos_css; - struct blkg_rwstat sum; + struct blkg_rwstat sum = { }; int i; lockdep_assert_held(pd->blkg->q->queue_lock); - sum = blkg_rwstat_read((void *)pd + off); - rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, pd_to_blkg(pd)) { struct blkg_policy_data *pos_pd = blkg_to_pd(pos_blkg, pol); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 855538630300..ae6969a7ffd4 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -291,6 +291,7 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, * read locked. If called under either blkcg or queue lock, the iteration * is guaranteed to include all and only online blkgs. The caller may * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. */ #define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ @@ -304,7 +305,8 @@ struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q, * @p_blkg: target blkg to walk descendants of * * Similar to blkg_for_each_descendant_pre() but performs post-order - * traversal instead. Synchronization rules are the same. + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. */ #define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 8cefa7f8590e..8331aba9426f 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1379,7 +1379,6 @@ static int tg_set_conf(struct cgroup_subsys_state *css, struct cftype *cft, * restrictions in the whole hierarchy and allows them to bypass * blk-throttle. */ - tg_update_has_rules(tg); blkg_for_each_descendant_pre(blkg, pos_css, ctx.blkg) tg_update_has_rules(blkg_to_tg(blkg)); @@ -1639,8 +1638,6 @@ void blk_throtl_drain(struct request_queue *q) blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) tg_drain_bios(&blkg_to_tg(blkg)->service_queue); - tg_drain_bios(&td_root_tg(td)->service_queue); - /* finally, transfer bios from top-level tg's into the td */ tg_drain_bios(&td->service_queue); diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c40e508d54e9..8ec5b0f38292 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -798,7 +798,8 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos); * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * - * Walk @root's descendants. Must be called under rcu_read_lock(). A + * Walk @root's descendants. @root is included in the iteration and the + * first node to be visited. Must be called under rcu_read_lock(). A * descendant css which hasn't finished ->css_online() or already has * finished ->css_offline() may show up during traversal and it's each * subsystem's responsibility to verify that each @pos is alive. @@ -820,13 +821,12 @@ css_rightmost_descendant(struct cgroup_subsys_state *pos); * * my_update_state(@css) * { - * Lock @css; - * Update @css's state; - * Unlock @css; - * * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; - * Verify @pos is alive and inherit state from @pos's parent; + * if (@pos == @css) + * Update @css's state; + * else + * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } @@ -864,8 +864,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order - * traversal instead. Note that the walk visibility guarantee described in - * pre-order walk doesn't apply the same to post-order walks. + * traversal instead. @root is included in the iteration and the last + * node to be visited. Note that the walk visibility guarantee described + * in pre-order walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c02a288a4e3d..52f0498db946 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2868,17 +2868,6 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) mutex_unlock(&cgroup_mutex); - /* @root always needs to be updated */ - inode = root->dentry->d_inode; - mutex_lock(&inode->i_mutex); - mutex_lock(&cgroup_mutex); - ret = cgroup_addrm_files(root, cfts, is_add); - mutex_unlock(&cgroup_mutex); - mutex_unlock(&inode->i_mutex); - - if (ret) - goto out_deact; - /* add/rm files for all cgroups created before */ rcu_read_lock(); css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { @@ -2907,7 +2896,6 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) } rcu_read_unlock(); dput(prev); -out_deact: deactivate_super(sb); return ret; } @@ -3099,7 +3087,8 @@ EXPORT_SYMBOL_GPL(css_next_child); * @root: css whose descendants to walk * * To be used by css_for_each_descendant_pre(). Find the next descendant - * to visit for pre-order traversal of @root's descendants. + * to visit for pre-order traversal of @root's descendants. @root is + * included in the iteration and the first node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This @@ -3114,9 +3103,9 @@ css_next_descendant_pre(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, pretend we just visited @root */ + /* if first iteration, visit @root */ if (!pos) - pos = root; + return root; /* visit the first child if exists */ next = css_next_child(NULL, pos); @@ -3186,7 +3175,8 @@ css_leftmost_descendant(struct cgroup_subsys_state *pos) * @root: css whose descendants to walk * * To be used by css_for_each_descendant_post(). Find the next descendant - * to visit for post-order traversal of @root's descendants. + * to visit for post-order traversal of @root's descendants. @root is + * included in the iteration and the last node to be visited. * * While this function requires RCU read locking, it doesn't require the * whole traversal to be contained in a single RCU critical section. This @@ -3207,14 +3197,17 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, return next != root ? next : NULL; } + /* if we visited @root, we're done */ + if (pos == root) + return NULL; + /* if there's an unvisited sibling, visit its leftmost descendant */ next = css_next_child(pos, css_parent(pos)); if (next) return css_leftmost_descendant(next); /* no sibling left, visit parent */ - next = css_parent(pos); - return next != root ? next : NULL; + return css_parent(pos); } EXPORT_SYMBOL_GPL(css_next_descendant_post); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 224da9aa27f5..f0ff64d0ebaa 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -311,7 +311,6 @@ static int freezer_read(struct cgroup_subsys_state *css, struct cftype *cft, /* update states bottom-up */ css_for_each_descendant_post(pos, css) update_if_frozen(pos); - update_if_frozen(css); rcu_read_unlock(); @@ -391,11 +390,6 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) { struct cgroup_subsys_state *pos; - /* update @freezer */ - spin_lock_irq(&freezer->lock); - freezer_apply_state(freezer, freeze, CGROUP_FREEZING_SELF); - spin_unlock_irq(&freezer->lock); - /* * Update all its descendants in pre-order traversal. Each * descendant will try to inherit its parent's FREEZING state as @@ -406,14 +400,23 @@ static void freezer_change_state(struct freezer *freezer, bool freeze) struct freezer *pos_f = css_freezer(pos); struct freezer *parent = parent_freezer(pos_f); - /* - * Our update to @parent->state is already visible which is - * all we need. No need to lock @parent. For more info on - * synchronization, see freezer_post_create(). - */ spin_lock_irq(&pos_f->lock); - freezer_apply_state(pos_f, parent->state & CGROUP_FREEZING, - CGROUP_FREEZING_PARENT); + + if (pos_f == freezer) { + freezer_apply_state(pos_f, freeze, + CGROUP_FREEZING_SELF); + } else { + /* + * Our update to @parent->state is already visible + * which is all we need. No need to lock @parent. + * For more info on synchronization, see + * freezer_post_create(). + */ + freezer_apply_state(pos_f, + parent->state & CGROUP_FREEZING, + CGROUP_FREEZING_PARENT); + } + spin_unlock_irq(&pos_f->lock); } rcu_read_unlock(); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index bf69717325b4..72a0383f382f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -222,7 +222,8 @@ static struct cpuset top_cpuset = { * * Walk @des_cs through the online descendants of @root_cs. Must be used * with RCU read locked. The caller may modify @pos_css by calling - * css_rightmost_descendant() to skip subtree. + * css_rightmost_descendant() to skip subtree. @root_cs is included in the + * iteration and the first node to be visited. */ #define cpuset_for_each_descendant_pre(des_cs, pos_css, root_cs) \ css_for_each_descendant_pre((pos_css), &(root_cs)->css) \ @@ -506,6 +507,9 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { + if (cp == root_cs) + continue; + /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); @@ -613,6 +617,8 @@ static int generate_sched_domains(cpumask_var_t **domains, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, &top_cpuset) { + if (cp == &top_cpuset) + continue; /* * Continue traversing beyond @cp iff @cp has some CPUs and * isn't load balancing. The former is obvious. The @@ -875,15 +881,17 @@ static void update_tasks_cpumask_hier(struct cpuset *root_cs, struct cpuset *cp; struct cgroup_subsys_state *pos_css; - if (update_root) - update_tasks_cpumask(root_cs, heap); - rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp have some CPU */ - if (!cpumask_empty(cp->cpus_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!cpumask_empty(cp->cpus_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } } if (!css_tryget(&cp->css)) continue; @@ -1130,15 +1138,17 @@ static void update_tasks_nodemask_hier(struct cpuset *root_cs, struct cpuset *cp; struct cgroup_subsys_state *pos_css; - if (update_root) - update_tasks_nodemask(root_cs, heap); - rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - /* skip the whole subtree if @cp have some CPU */ - if (!nodes_empty(cp->mems_allowed)) { - pos_css = css_rightmost_descendant(pos_css); - continue; + if (cp == root_cs) { + if (!update_root) + continue; + } else { + /* skip the whole subtree if @cp have some CPU */ + if (!nodes_empty(cp->mems_allowed)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } } if (!css_tryget(&cp->css)) continue; @@ -2237,7 +2247,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rcu_read_lock(); cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { - if (!css_tryget(&cs->css)) + if (cs == &top_cpuset || !css_tryget(&cs->css)) continue; rcu_read_unlock(); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2885e3e85047..b89d4cbc0c08 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1079,14 +1079,7 @@ static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, { struct cgroup_subsys_state *prev_css, *next_css; - /* - * Root is not visited by cgroup iterators so it needs an - * explicit visit. - */ - if (!last_visited) - return root; - - prev_css = (last_visited == root) ? NULL : &last_visited->css; + prev_css = last_visited ? &last_visited->css : NULL; skip_node: next_css = css_next_descendant_pre(prev_css, &root->css); diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 9bf230aa28b0..c123628d3f84 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -456,7 +456,7 @@ static int propagate_exception(struct dev_cgroup *devcg_root, * methods), and online ones are safe to access outside RCU * read lock without bumping refcnt. */ - if (!is_devcg_online(devcg)) + if (pos == &devcg_root->css || !is_devcg_online(devcg)) continue; rcu_read_unlock(); -- cgit v1.3-8-gc7d7 From 851cf6e7d6366195d4ee033cdc7787df1a649a14 Mon Sep 17 00:00:00 2001 From: Andrew Jones Date: Fri, 9 Aug 2013 19:51:57 +0530 Subject: jump_label: Split jumplabel ratelimit Commit b202952075f62603bea9bfb6ebc6b0420db11949 ("perf, core: Rate limit perf_sched_events jump_label patching") introduced rate limiting for jump label disabling. The changes were made in the jump label code in order to be more widely available and to keep things tidier. This is all fine, except now jump_label.h includes linux/workqueue.h, which makes it impossible to include jump_label.h from anything that workqueue.h needs. For example, it's now impossible to include jump_label.h from asm/spinlock.h, which is done in proposed pv-ticketlock patches. This patch splits out the rate limiting related changes from jump_label.h into a new file, jump_label_ratelimit.h, to resolve the issue. Signed-off-by: Andrew Jones Link: http://lkml.kernel.org/r/1376058122-8248-10-git-send-email-raghavendra.kt@linux.vnet.ibm.com Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Raghavendra K T Acked-by: Ingo Molnar Signed-off-by: H. Peter Anvin --- include/linux/jump_label.h | 28 +--------------------------- include/linux/jump_label_ratelimit.h | 34 ++++++++++++++++++++++++++++++++++ include/linux/perf_event.h | 1 + kernel/jump_label.c | 1 + 4 files changed, 37 insertions(+), 27 deletions(-) create mode 100644 include/linux/jump_label_ratelimit.h (limited to 'kernel') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 0976fc46d1e0..a5079072da66 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -48,7 +48,6 @@ #include #include -#include #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) @@ -61,12 +60,6 @@ struct static_key { #endif }; -struct static_key_deferred { - struct static_key key; - unsigned long timeout; - struct delayed_work work; -}; - # include # define HAVE_JUMP_LABEL #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ @@ -78,6 +71,7 @@ enum jump_label_type { struct module; +#include #ifdef HAVE_JUMP_LABEL #define JUMP_LABEL_TRUE_BRANCH 1UL @@ -119,10 +113,7 @@ extern void arch_jump_label_transform_static(struct jump_entry *entry, extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); -extern void static_key_slow_dec_deferred(struct static_key_deferred *key); extern void jump_label_apply_nops(struct module *mod); -extern void -jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); #define STATIC_KEY_INIT_TRUE ((struct static_key) \ { .enabled = ATOMIC_INIT(1), .entries = (void *)1 }) @@ -131,8 +122,6 @@ jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); #else /* !HAVE_JUMP_LABEL */ -#include - struct static_key { atomic_t enabled; }; @@ -141,10 +130,6 @@ static __always_inline void jump_label_init(void) { } -struct static_key_deferred { - struct static_key key; -}; - static __always_inline bool static_key_false(struct static_key *key) { if (unlikely(atomic_read(&key->enabled)) > 0) @@ -169,11 +154,6 @@ static inline void static_key_slow_dec(struct static_key *key) atomic_dec(&key->enabled); } -static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) -{ - static_key_slow_dec(&key->key); -} - static inline int jump_label_text_reserved(void *start, void *end) { return 0; @@ -187,12 +167,6 @@ static inline int jump_label_apply_nops(struct module *mod) return 0; } -static inline void -jump_label_rate_limit(struct static_key_deferred *key, - unsigned long rl) -{ -} - #define STATIC_KEY_INIT_TRUE ((struct static_key) \ { .enabled = ATOMIC_INIT(1) }) #define STATIC_KEY_INIT_FALSE ((struct static_key) \ diff --git a/include/linux/jump_label_ratelimit.h b/include/linux/jump_label_ratelimit.h new file mode 100644 index 000000000000..113788389b3d --- /dev/null +++ b/include/linux/jump_label_ratelimit.h @@ -0,0 +1,34 @@ +#ifndef _LINUX_JUMP_LABEL_RATELIMIT_H +#define _LINUX_JUMP_LABEL_RATELIMIT_H + +#include +#include + +#if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_JUMP_LABEL) +struct static_key_deferred { + struct static_key key; + unsigned long timeout; + struct delayed_work work; +}; +#endif + +#ifdef HAVE_JUMP_LABEL +extern void static_key_slow_dec_deferred(struct static_key_deferred *key); +extern void +jump_label_rate_limit(struct static_key_deferred *key, unsigned long rl); + +#else /* !HAVE_JUMP_LABEL */ +struct static_key_deferred { + struct static_key key; +}; +static inline void static_key_slow_dec_deferred(struct static_key_deferred *key) +{ + static_key_slow_dec(&key->key); +} +static inline void +jump_label_rate_limit(struct static_key_deferred *key, + unsigned long rl) +{ +} +#endif /* HAVE_JUMP_LABEL */ +#endif /* _LINUX_JUMP_LABEL_RATELIMIT_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c43f6eabad5b..226be8da3f85 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -48,6 +48,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include #include #include diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 60f48fa0fd0d..297a9247a3b3 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef HAVE_JUMP_LABEL -- cgit v1.3-8-gc7d7 From fbb00b568bc93073452d2a0f9f06e7c33d16eece Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 19 Jun 2013 23:56:22 +0200 Subject: sched: Consolidate open coded preemptible() checks preempt_schedule() and preempt_schedule_context() open code their preemptability checks. Use the standard API instead for consolidation. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Li Zhong Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Borislav Petkov Cc: Alex Shi Cc: Paul Turner Cc: Mike Galbraith Cc: Vincent Guittot --- kernel/context_tracking.c | 3 +-- kernel/sched/core.c | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 383f8231e436..942835c12ae5 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -87,10 +87,9 @@ void user_enter(void) */ void __sched notrace preempt_schedule_context(void) { - struct thread_info *ti = current_thread_info(); enum ctx_state prev_ctx; - if (likely(ti->preempt_count || irqs_disabled())) + if (likely(!preemptible())) return; /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7c32cb7bfeb..3fb7acee7326 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2510,13 +2510,11 @@ void __sched schedule_preempt_disabled(void) */ asmlinkage void __sched notrace preempt_schedule(void) { - struct thread_info *ti = current_thread_info(); - /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ - if (likely(ti->preempt_count || irqs_disabled())) + if (likely(!preemptible())) return; do { -- cgit v1.3-8-gc7d7 From 2d854e5738cded368a0759f85b1197f5c044513d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jul 2013 19:02:30 +0200 Subject: context_tracing: Fix guest accounting with native vtime 1) If context tracking is enabled with native vtime accounting (which combo is useless except for dev testing), we call vtime_guest_enter() and vtime_guest_exit() on host <-> guest switches. But those are stubs in this configurations. As a result, cputime is not correctly flushed on kvm context switches. 2) If context tracking runs but is disabled on some CPUs, those CPUs end up calling __guest_enter/__guest_exit which in turn call vtime_account_system(). We don't want to call this because we run in tick based accounting for these CPUs. Refactor the guest_enter/guest_exit code such that all combinations finally work. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/context_tracking.h | 52 +++++++++++++++++----------------------- kernel/context_tracking.c | 6 +++-- 2 files changed, 26 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index fc09d7b0dacf..5984f2556d13 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -20,25 +20,6 @@ struct context_tracking { } state; }; -static inline void __guest_enter(void) -{ - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ - vtime_account_system(current); - current->flags |= PF_VCPU; -} - -static inline void __guest_exit(void) -{ - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ - vtime_account_system(current); - current->flags &= ~PF_VCPU; -} #ifdef CONFIG_CONTEXT_TRACKING DECLARE_PER_CPU(struct context_tracking, context_tracking); @@ -56,9 +37,6 @@ static inline bool context_tracking_active(void) extern void user_enter(void); extern void user_exit(void); -extern void guest_enter(void); -extern void guest_exit(void); - static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; @@ -81,21 +59,35 @@ extern void context_tracking_task_switch(struct task_struct *prev, static inline bool context_tracking_in_user(void) { return false; } static inline void user_enter(void) { } static inline void user_exit(void) { } +static inline enum ctx_state exception_enter(void) { return 0; } +static inline void exception_exit(enum ctx_state prev_ctx) { } +static inline void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) { } +#endif /* !CONFIG_CONTEXT_TRACKING */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +extern void guest_enter(void); +extern void guest_exit(void); +#else static inline void guest_enter(void) { - __guest_enter(); + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system(current); + current->flags |= PF_VCPU; } static inline void guest_exit(void) { - __guest_exit(); + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system(current); + current->flags &= ~PF_VCPU; } - -static inline enum ctx_state exception_enter(void) { return 0; } -static inline void exception_exit(enum ctx_state prev_ctx) { } -static inline void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) { } -#endif /* !CONFIG_CONTEXT_TRACKING */ +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ #endif diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 942835c12ae5..1f47119c5b09 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -141,12 +141,13 @@ void user_exit(void) local_irq_restore(flags); } +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN void guest_enter(void) { if (vtime_accounting_enabled()) vtime_guest_enter(current); else - __guest_enter(); + current->flags |= PF_VCPU; } EXPORT_SYMBOL_GPL(guest_enter); @@ -155,9 +156,10 @@ void guest_exit(void) if (vtime_accounting_enabled()) vtime_guest_exit(current); else - __guest_exit(); + current->flags &= ~PF_VCPU; } EXPORT_SYMBOL_GPL(guest_exit); +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ /** -- cgit v1.3-8-gc7d7 From 5b206d48e58204e84d249c4eb18651a1ff7a1274 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jul 2013 19:05:14 +0200 Subject: vtime: Update a few comments Update a stale comment from the old vtime era and document some locking that might be non obvious. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/context_tracking.h | 10 ++++------ kernel/sched/cputime.c | 7 +++++++ 2 files changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 5984f2556d13..d883ff0dd8fc 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -72,8 +72,9 @@ extern void guest_exit(void); static inline void guest_enter(void) { /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. + * This is running in ioctl context so its safe + * to assume that it's the stime pending cputime + * to flush. */ vtime_account_system(current); current->flags |= PF_VCPU; @@ -81,10 +82,7 @@ static inline void guest_enter(void) static inline void guest_exit(void) { - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ + /* Flush the guest cputime we spent on the guest */ vtime_account_system(current); current->flags &= ~PF_VCPU; } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..223a35efa0a6 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -712,6 +712,13 @@ void vtime_user_enter(struct task_struct *tsk) void vtime_guest_enter(struct task_struct *tsk) { + /* + * The flags must be updated under the lock with + * the vtime_snap flush and update. + * That enforces a right ordering and update sequence + * synchronization against the reader (task_gtime()) + * that can thus safely catch up with a tickless delta. + */ write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); current->flags |= PF_VCPU; -- cgit v1.3-8-gc7d7 From d65ec12127a5b6c6d7f5331c78157dab98a20ff0 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Jul 2013 23:59:33 +0200 Subject: context_tracking: Fix runtime CPU off-case As long as the context tracking is enabled on any CPU, even a single one, all other CPUs need to keep track of their user <-> kernel boundaries cross as well. This is because a task can sleep while servicing an exception that happened in the kernel or in userspace. Then when the task eventually wakes up and return from the exception, the CPU needs to know if we resume in userspace or in the kernel. exception_exit() get this information from exception_enter() that saved the previous state. If the CPU where the exception happened didn't keep track of these informations, exception_exit() doesn't know which state tracking to restore on the CPU where the task got migrated and we may return to userspace with the context tracking subsystem thinking that we are in kernel mode. This can be fixed in the long term if we move our context tracking probes on very low level arch fast path user <-> kernel boundary, although even that is worrisome as an exception can still happen in the few instructions between the probe and the actual iret. Also we are not yet ready to set these probes in the fast path given the potential overhead problem it induces. So let's fix this by always enable context tracking even on CPUs that are not in the full dynticks range. OTOH we can spare the rcu_user_*() and vtime_user_*() calls there because the tick runs on these CPUs and we can handle RCU state machine and cputime accounting through it. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/context_tracking.c | 52 ++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 1f47119c5b09..7b095de356c5 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -54,17 +54,31 @@ void user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if (__this_cpu_read(context_tracking.active) && - __this_cpu_read(context_tracking.state) != IN_USER) { + if ( __this_cpu_read(context_tracking.state) != IN_USER) { + if (__this_cpu_read(context_tracking.active)) { + /* + * At this stage, only low level arch entry code remains and + * then we'll run in userspace. We can assume there won't be + * any RCU read-side critical section until the next call to + * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency + * on the tick. + */ + vtime_user_enter(current); + rcu_user_enter(); + } /* - * At this stage, only low level arch entry code remains and - * then we'll run in userspace. We can assume there won't be - * any RCU read-side critical section until the next call to - * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency - * on the tick. + * Even if context tracking is disabled on this CPU, because it's outside + * the full dynticks mask for example, we still have to keep track of the + * context transitions and states to prevent inconsistency on those of + * other CPUs. + * If a task triggers an exception in userspace, sleep on the exception + * handler and then migrate to another CPU, that new CPU must know where + * the exception returns by the time we call exception_exit(). + * This information can only be provided by the previous CPU when it called + * exception_enter(). + * OTOH we can spare the calls to vtime and RCU when context_tracking.active + * is false because we know that CPU is not tickless. */ - vtime_user_enter(current); - rcu_user_enter(); __this_cpu_write(context_tracking.state, IN_USER); } local_irq_restore(flags); @@ -130,12 +144,14 @@ void user_exit(void) local_irq_save(flags); if (__this_cpu_read(context_tracking.state) == IN_USER) { - /* - * We are going to run code that may use RCU. Inform - * RCU core about that (ie: we may need the tick again). - */ - rcu_user_exit(); - vtime_user_exit(current); + if (__this_cpu_read(context_tracking.active)) { + /* + * We are going to run code that may use RCU. Inform + * RCU core about that (ie: we may need the tick again). + */ + rcu_user_exit(); + vtime_user_exit(current); + } __this_cpu_write(context_tracking.state, IN_KERNEL); } local_irq_restore(flags); @@ -178,8 +194,6 @@ EXPORT_SYMBOL_GPL(guest_exit); void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { - if (__this_cpu_read(context_tracking.active)) { - clear_tsk_thread_flag(prev, TIF_NOHZ); - set_tsk_thread_flag(next, TIF_NOHZ); - } + clear_tsk_thread_flag(prev, TIF_NOHZ); + set_tsk_thread_flag(next, TIF_NOHZ); } -- cgit v1.3-8-gc7d7 From 2e70933866ace52091a3c11a5c104c063ab0c445 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Jul 2013 00:55:25 +0200 Subject: nohz: Only enable context tracking on full dynticks CPUs The context tracking subsystem has the ability to selectively enable the tracking on any defined subset of CPU. This means that we can define a CPU range that doesn't run the context tracking and another range that does. Now what we want in practice is to enable the tracking on full dynticks CPUs only. In order to perform this, we just need to pass our full dynticks CPU range selection from the full dynticks subsystem to the context tracking. This way we can spare the overhead of RCU user extended quiescent state and vtime maintainance on the CPUs that are outside the full dynticks range. Just keep in mind the raw context tracking itself is still necessary everywhere. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/context_tracking.h | 2 ++ kernel/context_tracking.c | 5 +++++ kernel/time/tick-sched.c | 6 ++++++ 3 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d883ff0dd8fc..1ae37c708c63 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -34,6 +34,8 @@ static inline bool context_tracking_active(void) return __this_cpu_read(context_tracking.active); } +extern void context_tracking_cpu_set(int cpu); + extern void user_enter(void); extern void user_exit(void); diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 7b095de356c5..72bcb2570d3e 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -26,6 +26,11 @@ DEFINE_PER_CPU(struct context_tracking, context_tracking) = { #endif }; +void context_tracking_cpu_set(int cpu) +{ + per_cpu(context_tracking.active, cpu) = true; +} + /** * user_enter - Inform the context tracking that the CPU is going to * enter userspace mode. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9563c744dad2..91a2528b5f44 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -344,11 +345,16 @@ static int tick_nohz_init_all(void) void __init tick_nohz_init(void) { + int cpu; + if (!have_nohz_full_mask) { if (tick_nohz_init_all() < 0) return; } + for_each_cpu(cpu, nohz_full_mask) + context_tracking_cpu_set(cpu); + cpu_notifier(tick_nohz_cpu_down_callback, 0); cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); -- cgit v1.3-8-gc7d7 From d84d27a491880b9902b45c09be8d9e9464fb9b74 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Jul 2013 21:59:29 +0200 Subject: context_tracking: Remove full dynticks' hacky dependency on wide context tracking Now that the full dynticks subsystem only enables the context tracking on full dynticks CPUs, lets remove the dependency on CONTEXT_TRACKING_FORCE This dependency was a hack to enable the context tracking widely for the full dynticks susbsystem until the latter becomes able to enable it in a more CPU-finegrained fashion. Now CONTEXT_TRACKING_FORCE only stands for testing on archs that work on support for the context tracking while full dynticks can't be used yet due to unmet dependencies. It simulates a system where all CPUs are full dynticks so that RCU user extended quiescent states and dynticks cputime accounting can be tested on the given arch. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- init/Kconfig | 28 ++++++++++++++++++++++------ kernel/time/Kconfig | 1 - 2 files changed, 22 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index 247084be0590..ffbf5d788bf3 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -527,13 +527,29 @@ config RCU_USER_QS config CONTEXT_TRACKING_FORCE bool "Force context tracking" depends on CONTEXT_TRACKING - default CONTEXT_TRACKING + default y if !NO_HZ_FULL help - Probe on user/kernel boundaries by default in order to - test the features that rely on it such as userspace RCU extended - quiescent states. - This test is there for debugging until we have a real user like the - full dynticks mode. + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also + other dependencies to provide in order to make the full + dynticks working. + + This option stands for testing when an arch implements the + context tracking backend but doesn't yet fullfill all the + requirements to make the full dynticks feature working. + Without the full dynticks, there is no way to test the support + for context tracking and the subsystems that rely on it: RCU + userspace extended quiescent state and tickless cputime + accounting. This option copes with the absence of the full + dynticks subsystem by forcing the context tracking on all + CPUs in the system. + + Say Y only if you're working on the developpement of an + architecture backend for the context tracking. + + Say N otherwise, this option brings an overhead that you + don't want in production. + config RCU_FANOUT int "Tree-based hierarchical RCU fanout value" diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012b..747bbc70f53b 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -105,7 +105,6 @@ config NO_HZ_FULL select RCU_USER_QS select RCU_NOCB_CPU select VIRT_CPU_ACCOUNTING_GEN - select CONTEXT_TRACKING_FORCE select IRQ_WORK help Adaptively try to shutdown the tick whenever possible, even when -- cgit v1.3-8-gc7d7 From b9d10be7a8e88fdcb12540387c219cdde87b0795 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 12 Aug 2013 09:45:53 -0600 Subject: ACPI / processor: Acquire writer lock to update CPU maps CPU system maps are protected with reader/writer locks. The reader lock, get_online_cpus(), assures that the maps are not updated while holding the lock. The writer lock, cpu_hotplug_begin(), is used to udpate the cpu maps along with cpu_maps_update_begin(). However, the ACPI processor handler updates the cpu maps without holding the the writer lock. acpi_map_lsapic() is called from acpi_processor_hotadd_init() to update cpu_possible_mask and cpu_present_mask. acpi_unmap_lsapic() is called from acpi_processor_remove() to update cpu_possible_mask. Currently, they are either unprotected or protected with the reader lock, which is not correct. For example, the get_online_cpus() below is supposed to assure that cpu_possible_mask is not changed while the code is iterating with for_each_possible_cpu(). get_online_cpus(); for_each_possible_cpu(cpu) { : } put_online_cpus(); However, this lock has no protection with CPU hotplug since the ACPI processor handler does not use the writer lock when it updates cpu_possible_mask. The reader lock does not serialize within the readers. This patch protects them with the writer lock with cpu_hotplug_begin() along with cpu_maps_update_begin(), which must be held before calling cpu_hotplug_begin(). It also protects arch_register_cpu() / arch_unregister_cpu(), which creates / deletes a sysfs cpu device interface. For this purpose it changes cpu_hotplug_begin() and cpu_hotplug_done() to global and exports them in cpu.h. Signed-off-by: Toshi Kani Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_processor.c | 21 ++++++++++++++++----- include/linux/cpu.h | 4 ++++ kernel/cpu.c | 9 +++------ 3 files changed, 23 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/drivers/acpi/acpi_processor.c b/drivers/acpi/acpi_processor.c index 5a74a9c1e42c..f29e06efa479 100644 --- a/drivers/acpi/acpi_processor.c +++ b/drivers/acpi/acpi_processor.c @@ -178,14 +178,17 @@ static int acpi_processor_hotadd_init(struct acpi_processor *pr) if (ACPI_FAILURE(status) || !(sta & ACPI_STA_DEVICE_PRESENT)) return -ENODEV; + cpu_maps_update_begin(); + cpu_hotplug_begin(); + ret = acpi_map_lsapic(pr->handle, &pr->id); if (ret) - return ret; + goto out; ret = arch_register_cpu(pr->id); if (ret) { acpi_unmap_lsapic(pr->id); - return ret; + goto out; } /* @@ -195,7 +198,11 @@ static int acpi_processor_hotadd_init(struct acpi_processor *pr) */ pr_info("CPU%d has been hot-added\n", pr->id); pr->flags.need_hotplug_init = 1; - return 0; + +out: + cpu_hotplug_done(); + cpu_maps_update_done(); + return ret; } #else static inline int acpi_processor_hotadd_init(struct acpi_processor *pr) @@ -452,11 +459,15 @@ static void acpi_processor_remove(struct acpi_device *device) per_cpu(processor_device_array, pr->id) = NULL; per_cpu(processors, pr->id) = NULL; + cpu_maps_update_begin(); + cpu_hotplug_begin(); + /* Remove the CPU. */ - get_online_cpus(); arch_unregister_cpu(pr->id); acpi_unmap_lsapic(pr->id); - put_online_cpus(); + + cpu_hotplug_done(); + cpu_maps_update_done(); try_offline_node(cpu_to_node(pr->id)); diff --git a/include/linux/cpu.h b/include/linux/cpu.h index ab0eade73039..956c0a16566f 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -172,6 +172,8 @@ extern struct bus_type cpu_subsys; #ifdef CONFIG_HOTPLUG_CPU /* Stop CPUs going up and down. */ +extern void cpu_hotplug_begin(void); +extern void cpu_hotplug_done(void); extern void get_online_cpus(void); extern void put_online_cpus(void); extern void cpu_hotplug_disable(void); @@ -197,6 +199,8 @@ static inline void cpu_hotplug_driver_unlock(void) #else /* CONFIG_HOTPLUG_CPU */ +static inline void cpu_hotplug_begin(void) {} +static inline void cpu_hotplug_done(void) {} #define get_online_cpus() do { } while (0) #define put_online_cpus() do { } while (0) #define cpu_hotplug_disable() do { } while (0) diff --git a/kernel/cpu.c b/kernel/cpu.c index b2b227b82123..d7f07a2da5a6 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -113,7 +113,7 @@ EXPORT_SYMBOL_GPL(put_online_cpus); * get_online_cpus() not an api which is called all that often. * */ -static void cpu_hotplug_begin(void) +void cpu_hotplug_begin(void) { cpu_hotplug.active_writer = current; @@ -127,7 +127,7 @@ static void cpu_hotplug_begin(void) } } -static void cpu_hotplug_done(void) +void cpu_hotplug_done(void) { cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); @@ -154,10 +154,7 @@ void cpu_hotplug_enable(void) cpu_maps_update_done(); } -#else /* #if CONFIG_HOTPLUG_CPU */ -static void cpu_hotplug_begin(void) {} -static void cpu_hotplug_done(void) {} -#endif /* #else #if CONFIG_HOTPLUG_CPU */ +#endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ int __ref register_cpu_notifier(struct notifier_block *nb) -- cgit v1.3-8-gc7d7 From 40e93b39cd5b6a347333a95152ce37deef37bbd0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:53 -0400 Subject: cgroup: always use cgroup_css() cgroup_css() is the accessor for cgroup->subsys[] but is not used consistently. cgroup->subsys[] will become RCU protected and cgroup_css() will grow synchronization sanity checks. In preparation, make all cgroup->subsys[] dereferences use cgroup_css() consistently. This patch doesn't introduce any functional difference. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 58 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 52f0498db946..49ad96ee08e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -574,7 +574,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgrp->subsys[i]; + template[i] = cgroup_css(cgrp, i); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -871,7 +871,7 @@ static void cgroup_free_fn(struct work_struct *work) * Release the subsystem state objects. */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); ss->css_free(css); } @@ -1067,27 +1067,27 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgrp->subsys[i]); - BUG_ON(!cgroup_dummy_top->subsys[i]); - BUG_ON(cgroup_dummy_top->subsys[i]->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, i)); + BUG_ON(!cgroup_css(cgroup_dummy_top, i)); + BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; - cgrp->subsys[i]->cgroup = cgrp; + cgroup_css(cgrp, i)->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgrp->subsys[i]); + ss->bind(cgroup_css(cgrp, i)); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgrp->subsys[i] != cgroup_dummy_top->subsys[i]); - BUG_ON(cgrp->subsys[i]->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, i) != cgroup_css(cgroup_dummy_top, i)); + BUG_ON(cgroup_css(cgrp, i)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_dummy_top->subsys[i]); - cgroup_dummy_top->subsys[i]->cgroup = cgroup_dummy_top; + ss->bind(cgroup_css(cgroup_dummy_top, i)); + cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; cgrp->subsys[i] = NULL; cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); @@ -2072,7 +2072,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss->can_attach) { retval = ss->can_attach(css, &tset); @@ -2114,7 +2114,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss->attach) ss->attach(css, &tset); @@ -2136,7 +2136,7 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (ss == failed_ss) break; @@ -2308,7 +2308,7 @@ static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); if (cft->ss) - return cgrp->subsys[cft->ss->subsys_id]; + return cgroup_css(cgrp, cft->ss->subsys_id); return &cgrp->dummy_css; } @@ -4241,7 +4241,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4285,7 +4285,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->id = NULL; if (cgrp == cgroup_dummy_top) css->flags |= CSS_ROOT; - BUG_ON(cgrp->subsys[ss->subsys_id]); + BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; /* @@ -4300,7 +4300,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, /* invoke ->css_online() on a new CSS and mark it online if successful */ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -4315,7 +4315,7 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); lockdep_assert_held(&cgroup_mutex); @@ -4400,7 +4400,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(parent->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(parent, ss->subsys_id)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4477,7 +4477,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); if (css) { percpu_ref_cancel_init(&css->refcnt); @@ -4590,7 +4590,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ atomic_set(&cgrp->css_kill_cnt, 1); for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); /* * Killing would put the base ref, but we need to keep it @@ -4676,7 +4676,7 @@ static void cgroup_offline_fn(struct work_struct *work) * destruction happens only after all css's are released. */ for_each_root_subsys(cgrp->root, ss) - css_put(cgrp->subsys[ss->subsys_id]); + css_put(cgroup_css(cgrp, ss->subsys_id)); /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); @@ -4741,7 +4741,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_cgroup_css(css, ss, cgroup_dummy_top); @@ -4820,7 +4820,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_dummy_top->subsys[ss->subsys_id]); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -4936,7 +4936,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_dummy_top->subsys[ss->subsys_id]); + ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); cgroup_dummy_top->subsys[ss->subsys_id] = NULL; mutex_unlock(&cgroup_mutex); @@ -5562,8 +5562,8 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; - parent_css = parent->subsys[subsys_id]; - child_css = child->subsys[subsys_id]; + parent_css = cgroup_css(parent, subsys_id); + child_css = cgroup_css(child, subsys_id); parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; @@ -5624,7 +5624,7 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) /* get cgroup */ cgrp = __d_cgrp(f->f_dentry); - css = cgrp->subsys[id]; + css = cgroup_css(cgrp, id); return css ? css : ERR_PTR(-ENOENT); } -- cgit v1.3-8-gc7d7 From 35ef10da65d43211f4cd7e7822cbb3becdfc0ae1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: rename cgroup_subsys_state->dput_work and its callback function css (cgroup_subsys_state) will become RCU protected and there will be two stages which require punting to work item during release. To prepare for using the work item for multiple times, rename css->dput_work to css->destroy_work and css_dput_fn() to css_free_work_fn() and move work item initialization from css init to right before the actual usage. This reorganization doesn't introduce any behavior change. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 21 ++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8ec5b0f38292..12d66fee26f8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -80,7 +80,7 @@ struct cgroup_subsys_state { struct css_id __rcu *id; /* Used to put @cgroup->dentry on the last css_put() */ - struct work_struct dput_work; + struct work_struct destroy_work; }; /* bits in struct cgroup_subsys_state flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 49ad96ee08e1..0b280978f097 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4259,10 +4259,10 @@ err: return ret; } -static void css_dput_fn(struct work_struct *work) +static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = - container_of(work, struct cgroup_subsys_state, dput_work); + container_of(work, struct cgroup_subsys_state, destroy_work); cgroup_dput(css->cgroup); } @@ -4272,7 +4272,14 @@ static void css_release(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - schedule_work(&css->dput_work); + /* + * css holds an extra ref to @cgrp->dentry which is put on the last + * css_put(). dput() requires process context, which css_put() may + * be called without. @css->destroy_work will be used to invoke + * dput() asynchronously from css_put(). + */ + INIT_WORK(&css->destroy_work, css_free_work_fn); + schedule_work(&css->destroy_work); } static void init_cgroup_css(struct cgroup_subsys_state *css, @@ -4287,14 +4294,6 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; - - /* - * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->dput_work will be used to invoke - * dput() asynchronously from css_put(). - */ - INIT_WORK(&css->dput_work, css_dput_fn); } /* invoke ->css_online() on a new CSS and mark it online if successful */ -- cgit v1.3-8-gc7d7 From 0ae78e0bf10ac38ab53548e18383afc9997eca22 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: add cgroup_subsys_state->parent With the planned unified hierarchy, css's (cgroup_subsys_state) will be RCU protected and allowed to be attached and detached dynamically over the course of a cgroup's lifetime. This means that css's will stay accessible after being detached from its cgroup - the matching pointer in cgroup->subsys[] cleared - for ref draining and RCU grace period. cgroup core still wants to guarantee that the parent css is never destroyed before its children and css_parent() always returns the parent regardless of the state of the child css as long as it's accessible. This patch makes css's hold onto their parents and adds css->parent so that the parent css is never detroyed before its children and can be determined without consulting the cgroups. cgroup->dummy_css is also updated to point to the parent dummy_css; however, it doesn't need to worry about object lifetime as the parent cgroup is already pinned by the child. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 13 ++++--------- kernel/cgroup.c | 18 +++++++++++++++--- 2 files changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 12d66fee26f8..8a5dc91fbaad 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -75,6 +75,9 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; + /* the parent css */ + struct cgroup_subsys_state *parent; + unsigned long flags; /* ID for this css, if possible */ struct css_id __rcu *id; @@ -666,15 +669,7 @@ struct cgroup_subsys { static inline struct cgroup_subsys_state *css_parent(struct cgroup_subsys_state *css) { - struct cgroup *parent_cgrp = css->cgroup->parent; - - if (!parent_cgrp) - return NULL; - - if (css->ss) - return parent_cgrp->subsys[css->ss->subsys_id]; - else - return &parent_cgrp->dummy_css; + return css->parent; } /** diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 0b280978f097..5c6dd7ed26a7 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4264,6 +4264,9 @@ static void css_free_work_fn(struct work_struct *work) struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + if (css->parent) + css_put(css->parent); + cgroup_dput(css->cgroup); } @@ -4290,8 +4293,12 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->ss = ss; css->flags = 0; css->id = NULL; - if (cgrp == cgroup_dummy_top) + + if (cgrp->parent) + css->parent = cgroup_css(cgrp->parent, ss->subsys_id); + else css->flags |= CSS_ROOT; + BUG_ON(cgroup_css(cgrp, ss->subsys_id)); cgrp->subsys[ss->subsys_id] = css; } @@ -4388,6 +4395,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, cgrp->dentry = dentry; cgrp->parent = parent; + cgrp->dummy_css.parent = &parent->dummy_css; cgrp->root = parent->root; if (notify_on_release(parent)) @@ -4436,9 +4444,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); root->number_of_cgroups++; - /* each css holds a ref to the cgroup's dentry */ - for_each_root_subsys(root, ss) + /* each css holds a ref to the cgroup's dentry and the parent css */ + for_each_root_subsys(root, ss) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + dget(dentry); + percpu_ref_get(&css->parent->refcnt); + } /* hold a ref to the parent's dentry */ dget(parent->dentry); -- cgit v1.3-8-gc7d7 From b77d7b6088377998ebf65eaea5e51008c2d75e94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:54 -0400 Subject: cgroup: cgroup_css_from_dir() now should be called with RCU read locked cgroup->subsys[] will become RCU protected and thus all cgroup_css() usages should either be under RCU read lock or cgroup_mutex. This patch updates cgroup_css_from_dir() which returns the matching cgroup_subsys_state given a directory file and subsys_id so that it requires RCU read lock and updates its sole user perf_cgroup_connect(). Signed-off-by: Tejun Heo Acked-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar --- kernel/cgroup.c | 12 ++++++++++-- kernel/events/core.c | 3 +++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 5c6dd7ed26a7..cbb6314f1836 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5616,8 +5616,14 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/* - * get corresponding css from file open on cgroupfs directory +/** + * cgroup_css_from_dir - get corresponding css from file open on cgroup dir + * @f: directory file of interest + * @id: subsystem id of interest + * + * Must be called under RCU read lock. The caller is responsible for + * pinning the returned css if it needs to be accessed outside the RCU + * critical section. */ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) { @@ -5625,6 +5631,8 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) struct inode *inode; struct cgroup_subsys_state *css; + WARN_ON_ONCE(!rcu_read_lock_held()); + inode = file_inode(f); /* check in cgroup filesystem dir */ if (inode->i_op != &cgroup_dir_inode_operations) diff --git a/kernel/events/core.c b/kernel/events/core.c index c199c4f24910..23261f957713 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -591,6 +591,8 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, if (!f.file) return -EBADF; + rcu_read_lock(); + css = cgroup_css_from_dir(f.file, perf_subsys_id); if (IS_ERR(css)) { ret = PTR_ERR(css); @@ -617,6 +619,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, ret = -EINVAL; } out: + rcu_read_unlock(); fdput(f); return ret; } -- cgit v1.3-8-gc7d7 From 105347ba5da3e87facce2337c50cd5df93cc6bec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: make cgroup_file_open() rcu_read_lock() around cgroup_css() and add cfent->css For the planned unified hierarchy, each css (cgroup_subsys_state) will be RCU protected so that it can be created and destroyed individually while allowing RCU accesses, and cgroup_css() will soon require either holding cgroup_mutex or RCU read lock. This patch updates cgroup_file_open() such that it acquires the associated css under rcu_read_lock(). While cgroup_file_css() usages in other file operations are safe due to the reference from open, cgroup_css() wouldn't know that and will still trigger warnings. It'd be cleanest to store the acquired css in file->prvidate_data for further file operations but that's already used by seqfile. This patch instead adds cfent->css to cache the associated css. Note that while this field is initialized during cfe init, it should only be considered valid while the file is open. This patch doesn't change visible behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 48 +++++++++++++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cbb6314f1836..d63beffd41e1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -117,6 +117,7 @@ struct cfent { struct list_head node; struct dentry *dentry; struct cftype *type; + struct cgroup_subsys_state *css; /* file xattrs */ struct simple_xattrs xattrs; @@ -2301,17 +2302,6 @@ static int cgroup_sane_behavior_show(struct cgroup_subsys_state *css, return 0; } -/* return the css for the given cgroup file */ -static struct cgroup_subsys_state *cgroup_file_css(struct cfent *cfe) -{ - struct cftype *cft = cfe->type; - struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); - - if (cft->ss) - return cgroup_css(cgrp, cft->ss->subsys_id); - return &cgrp->dummy_css; -} - /* A buffer size big enough for numbers or short strings */ #define CGROUP_LOCAL_BUFFER_SIZE 64 @@ -2388,7 +2378,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf, { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->write) return cft->write(css, cft, file, buf, nbytes, ppos); @@ -2430,7 +2420,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf, { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->read) return cft->read(css, cft, file, buf, nbytes, ppos); @@ -2456,7 +2446,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg) { struct cfent *cfe = m->private; struct cftype *cft = cfe->type; - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; if (cft->read_map) { struct cgroup_map_cb cb = { @@ -2479,7 +2469,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup *cgrp = __d_cgrp(cfe->dentry->d_parent); + struct cgroup_subsys_state *css; int err; err = generic_file_open(inode, file); @@ -2491,7 +2482,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * unpinned either on open failure or release. This ensures that * @css stays alive for all file operations. */ - if (css->ss && !css_tryget(css)) + rcu_read_lock(); + if (cft->ss) { + css = cgroup_css(cgrp, cft->ss->subsys_id); + if (!css_tryget(css)) + css = NULL; + } else { + css = &cgrp->dummy_css; + } + rcu_read_unlock(); + + /* css should match @cfe->css, see cgroup_add_file() for details */ + if (!css || WARN_ON_ONCE(css != cfe->css)) return -ENODEV; if (cft->read_map || cft->read_seq_string) { @@ -2510,7 +2512,7 @@ static int cgroup_file_release(struct inode *inode, struct file *file) { struct cfent *cfe = __d_cfe(file->f_dentry); struct cftype *cft = __d_cft(file->f_dentry); - struct cgroup_subsys_state *css = cgroup_file_css(cfe); + struct cgroup_subsys_state *css = cfe->css; int ret = 0; if (cft->release) @@ -2772,6 +2774,18 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) dentry->d_fsdata = cfe; simple_xattrs_init(&cfe->xattrs); + /* + * cfe->css is used by read/write/close to determine the associated + * css. file->private_data would be a better place but that's + * already used by seqfile. Note that open will use the usual + * cgroup_css() and css_tryget() to acquire the css and this + * caching doesn't affect css lifetime management. + */ + if (cft->ss) + cfe->css = cgroup_css(cgrp, cft->ss->subsys_id); + else + cfe->css = &cgrp->dummy_css; + mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { -- cgit v1.3-8-gc7d7 From 73e80ed8007fc48a6deeb295ba37159fad274bd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: add __rcu modifier to cgroup->subsys[] For the planned unified hierarchy, each css (cgroup_subsys_state) will be RCU protected so that it can be created and destroyed individually while allowing RCU accesses. Previous changes ensured that all cgroup->subsys[] accesses use the cgroup_css() accessor. This patch adds __rcu modifier to cgroup->subsys[], add matching RCU dereference in cgroup_css() and convert all assignments to either rcu_assign_pointer() or RCU_INIT_POINTER(). This change prepares for the actual RCUfication of css's and doesn't introduce any visible behavior change. The conversion is verified with sparse and all accesses are properly RCU annotated. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 19 ++++++++++++++----- 2 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 8a5dc91fbaad..eb200b5794e7 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -204,7 +204,7 @@ struct cgroup { struct cgroup_name __rcu *name; /* Private pointers for each registered subsystem */ - struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; + struct cgroup_subsys_state __rcu *subsys[CGROUP_SUBSYS_COUNT]; struct cgroupfs_root *root; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d63beffd41e1..c27101622567 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -229,11 +229,16 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], * @subsys_id: the subsystem of interest * * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. + * This function must be called either under cgroup_mutex or + * rcu_read_lock() and the caller is responsible for pinning the returned + * css if it wants to keep accessing it outside the said locks. This + * function may return %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, int subsys_id) { - return cgrp->subsys[subsys_id]; + return rcu_dereference_check(cgrp->subsys[subsys_id], + lockdep_is_held(&cgroup_mutex)); } /* convenient tests for these bits */ @@ -1072,8 +1077,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, BUG_ON(!cgroup_css(cgroup_dummy_top, i)); BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); - cgrp->subsys[i] = cgroup_dummy_top->subsys[i]; + rcu_assign_pointer(cgrp->subsys[i], + cgroup_css(cgroup_dummy_top, i)); cgroup_css(cgrp, i)->cgroup = cgrp; + list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) @@ -1088,8 +1095,10 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(cgroup_css(cgroup_dummy_top, i)); + cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; - cgrp->subsys[i] = NULL; + RCU_INIT_POINTER(cgrp->subsys[i], NULL); + cgroup_subsys[i]->root = &cgroup_dummy_root; list_move(&ss->sibling, &cgroup_dummy_root.subsys_list); @@ -4314,7 +4323,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); - cgrp->subsys[ss->subsys_id] = css; + rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4962,7 +4971,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * also takes care of freeing the css_id. */ ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); - cgroup_dummy_top->subsys[ss->subsys_id] = NULL; + RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); } -- cgit v1.3-8-gc7d7 From 623f926b050e12b0f5e3a2f4d11c36e4ddd63541 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 11:01:55 -0400 Subject: cgroup: reorganize css init / exit paths css (cgroup_subsys_state) lifetime management is about to be restructured. In prepartion, make the following mostly trivial changes. * init_cgroup_css() is renamed to init_css() so that it's consistent with other css handling functions. * alloc_css_id(), online_css() and offline_css() updated to take @css instead of cgroups and subsys IDs. This patch doesn't make any functional changes. v2: v1 merged two for_each_root_subsys() loops in cgroup_create() but Li Zefan pointed out that it breaks error path. Dropped. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 50 +++++++++++++++++++++++--------------------------- 1 file changed, 23 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c27101622567..a1ebc445f350 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -838,8 +838,7 @@ static struct backing_dev_info cgroup_backing_dev_info = { .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; -static int alloc_css_id(struct cgroup_subsys *ss, - struct cgroup *parent, struct cgroup *child); +static int alloc_css_id(struct cgroup_subsys_state *child_css); static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) { @@ -4308,9 +4307,8 @@ static void css_release(struct percpu_ref *ref) schedule_work(&css->destroy_work); } -static void init_cgroup_css(struct cgroup_subsys_state *css, - struct cgroup_subsys *ss, - struct cgroup *cgrp) +static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, + struct cgroup *cgrp) { css->cgroup = cgrp; css->ss = ss; @@ -4327,9 +4325,9 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* invoke ->css_online() on a new CSS and mark it online if successful */ -static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +static int online_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys *ss = css->ss; int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -4342,9 +4340,9 @@ static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) } /* if the CSS is online, invoke ->css_offline() on it and mark it offline */ -static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) +static void offline_css(struct cgroup_subsys_state *css) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys *ss = css->ss; lockdep_assert_held(&cgroup_mutex); @@ -4442,10 +4440,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, goto err_free_all; } - init_cgroup_css(css, ss, cgrp); + init_css(css, ss, cgrp); if (ss->use_id) { - err = alloc_css_id(ss, parent, cgrp); + err = alloc_css_id(css); if (err) goto err_free_all; } @@ -4480,7 +4478,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { - err = online_css(ss, cgrp); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + + err = online_css(css); if (err) goto err_destroy; @@ -4700,7 +4700,7 @@ static void cgroup_offline_fn(struct work_struct *work) * initate destruction. */ for_each_root_subsys(cgrp->root, ss) - offline_css(ss, cgrp); + offline_css(cgroup_css(cgrp, ss->subsys_id)); /* * Put the css refs from cgroup_destroy_locked(). Each css holds @@ -4778,7 +4778,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); - init_cgroup_css(css, ss, cgroup_dummy_top); + init_css(css, ss, cgroup_dummy_top); /* Update the init_css_set to contain a subsys * pointer to this state - since the subsystem is @@ -4793,7 +4793,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(ss, cgroup_dummy_top)); + BUG_ON(online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id))); mutex_unlock(&cgroup_mutex); @@ -4866,8 +4866,8 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) ss->root = &cgroup_dummy_root; /* our new subsystem will be attached to the dummy hierarchy. */ - init_cgroup_css(css, ss, cgroup_dummy_top); - /* init_idr must be after init_cgroup_css because it sets css->id. */ + init_css(css, ss, cgroup_dummy_top); + /* init_idr must be after init_css() because it sets css->id. */ if (ss->use_id) { ret = cgroup_init_idr(ss, css); if (ret) @@ -4897,7 +4897,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(ss, cgroup_dummy_top); + ret = online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (ret) goto err_unload; @@ -4936,7 +4936,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - offline_css(ss, cgroup_dummy_top); + offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); if (ss->use_id) idr_destroy(&ss->idr); @@ -5588,20 +5588,16 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, return 0; } -static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, - struct cgroup *child) +static int alloc_css_id(struct cgroup_subsys_state *child_css) { - int subsys_id, i, depth = 0; - struct cgroup_subsys_state *parent_css, *child_css; + struct cgroup_subsys_state *parent_css = css_parent(child_css); struct css_id *child_id, *parent_id; + int i, depth; - subsys_id = ss->subsys_id; - parent_css = cgroup_css(parent, subsys_id); - child_css = cgroup_css(child, subsys_id); parent_id = rcu_dereference_protected(parent_css->id, true); depth = parent_id->depth + 1; - child_id = get_new_cssid(ss, depth); + child_id = get_new_cssid(child_css->ss, depth); if (IS_ERR(child_id)) return PTR_ERR(child_id); -- cgit v1.3-8-gc7d7 From ae7f164a09408bf21ab3c82a9e80a3ff37aa9e3e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: move cgroup->subsys[] assignment to online_css() Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. With the planned unified hierarchy, css's will be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, css's will be individually RCU protected instead of being tied to the cgroup. In preparation, this patch moves cgroup->subsys[] assignment from init_css() to online_css(). As this means that a newly initialized css should be remembered separately and that cgroup_css() returns NULL between init and online, cgroup_create() is updated so that it stores newly created css's in a local array css_ar[] and cgroup_init/load_subsys() are updated to use local variable @css instead of using cgroup_css(). This change also slightly simplifies error path of cgroup_create(). While this patch changes when cgroup->subsys[] is initialized, this change isn't visible to subsystems or userland. v2: This patch wasn't updated accordingly after the previous "cgroup: reorganize css init / exit paths" was updated leading to missing a css_ar[] conversion in cgroup_create() and thus boot failure. Fix it. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a1ebc445f350..b9f736c3b36d 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4321,7 +4321,6 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->flags |= CSS_ROOT; BUG_ON(cgroup_css(cgrp, ss->subsys_id)); - rcu_assign_pointer(cgrp->subsys[ss->subsys_id], css); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4334,8 +4333,10 @@ static int online_css(struct cgroup_subsys_state *css) if (ss->css_online) ret = ss->css_online(css); - if (!ret) + if (!ret) { css->flags |= CSS_ONLINE; + rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); + } return ret; } @@ -4366,6 +4367,7 @@ static void offline_css(struct cgroup_subsys_state *css) static long cgroup_create(struct cgroup *parent, struct dentry *dentry, umode_t mode) { + struct cgroup_subsys_state *css_ar[CGROUP_SUBSYS_COUNT] = { }; struct cgroup *cgrp; struct cgroup_name *name; struct cgroupfs_root *root = parent->root; @@ -4433,12 +4435,11 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err = PTR_ERR(css); goto err_free_all; } + css_ar[ss->subsys_id] = css; err = percpu_ref_init(&css->refcnt, css_release); - if (err) { - ss->css_free(css); + if (err) goto err_free_all; - } init_css(css, ss, cgrp); @@ -4467,7 +4468,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* each css holds a ref to the cgroup's dentry and the parent css */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; dget(dentry); percpu_ref_get(&css->parent->refcnt); @@ -4478,7 +4479,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, /* creation succeeded, notify subsystems */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; err = online_css(css); if (err) @@ -4511,7 +4512,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, err_free_all: for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; if (css) { percpu_ref_cancel_init(&css->refcnt); @@ -4793,7 +4794,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) * need to invoke fork callbacks here. */ BUG_ON(!list_empty(&init_task.tasks)); - BUG_ON(online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id))); + BUG_ON(online_css(css)); mutex_unlock(&cgroup_mutex); @@ -4897,7 +4898,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) } write_unlock(&css_set_lock); - ret = online_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + ret = online_css(css); if (ret) goto err_unload; -- cgit v1.3-8-gc7d7 From 223dbc38d2a8745a93749dc75ed909e274ce075d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: bounce cgroup_subsys_state ref kill confirmation to a work item css (cgroup_subsys_state) offlining, which requires process context, will be moved to ref kill confirmation. In preparation, bounce css_killed handling through css->destroy_work. css_ref_killed_fn() is renamed to css_killed_ref_fn() so that it's consistent with the new css_killed_work_fn(). This patch adds an additional work item bouncing but doesn't change the actual logic. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9f736c3b36d..398ffbbee32f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4555,12 +4555,27 @@ static void cgroup_css_killed(struct cgroup *cgrp) schedule_work(&cgrp->destroy_work); } -static void css_ref_killed_fn(struct percpu_ref *ref) +/* + * This is called when the refcnt of a css is confirmed to be killed. + * css_tryget() is now guaranteed to fail. + */ +static void css_killed_work_fn(struct work_struct *work) +{ + struct cgroup_subsys_state *css = + container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; + + cgroup_css_killed(cgrp); +} + +/* css kill confirmation processing requires process context, bounce */ +static void css_killed_ref_fn(struct percpu_ref *ref) { struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - cgroup_css_killed(css->cgroup); + INIT_WORK(&css->destroy_work, css_killed_work_fn); + schedule_work(&css->destroy_work); } /** @@ -4634,7 +4649,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) percpu_ref_get(&css->refcnt); atomic_inc(&cgrp->css_kill_cnt); - percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn); + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } cgroup_css_killed(cgrp); -- cgit v1.3-8-gc7d7 From f20104de55a212a9742d8df1807f1f29dc95b748 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: replace cgroup->css_kill_cnt with ->nr_css Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. With the planned unified hierarchy, css's will be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, css's will be individually RCU protected instead of being tied to the cgroup. cgroup->css_kill_cnt is used during cgroup destruction to wait for css reference count disable; however, this model doesn't work once css's lifetimes are managed separately from cgroup's. This patch replaces it with cgroup->nr_css which is an cgroup_mutex protected integer counting the number of attached css's. The count is incremented from online_css() and decremented after refcnt kill is confirmed. If the count reaches zero and the cgroup is marked dead, the second stage of cgroup destruction is kicked off. If a cgroup doesn't have any css attached at the time of rmdir, cgroup_destroy_locked() now invokes the second stage directly as no css kill confirmation would happen. cgroup_offline_fn() - the second step of cgroup destruction - is renamed to cgroup_destroy_css_killed() and now expects to be called with cgroup_mutex held. While this patch changes how css destruction is punted to work items, it shouldn't change any visible behavior. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 4 +++- kernel/cgroup.c | 52 +++++++++++++++++++++++++++----------------------- 2 files changed, 31 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index eb200b5794e7..80dca872f4d4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -171,6 +171,9 @@ struct cgroup { */ int id; + /* the number of attached css's */ + int nr_css; + /* * We link our 'sibling' struct into our parent's 'children'. * Our children link their 'sibling' into our 'children'. @@ -234,7 +237,6 @@ struct cgroup { /* For css percpu_ref killing and RCU-protected deletion */ struct rcu_head rcu_head; struct work_struct destroy_work; - atomic_t css_kill_cnt; /* List of events which userspace want to receive */ struct list_head event_list; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 398ffbbee32f..174f4c3d72ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -218,7 +218,7 @@ static int need_forkexit_callback __read_mostly; static struct cftype cgroup_base_files[]; -static void cgroup_offline_fn(struct work_struct *work); +static void cgroup_destroy_css_killed(struct cgroup *cgrp); static int cgroup_destroy_locked(struct cgroup *cgrp); static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], bool is_add); @@ -4335,6 +4335,7 @@ static int online_css(struct cgroup_subsys_state *css) ret = ss->css_online(css); if (!ret) { css->flags |= CSS_ONLINE; + css->cgroup->nr_css++; rcu_assign_pointer(css->cgroup->subsys[ss->subsys_id], css); } return ret; @@ -4545,16 +4546,6 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return cgroup_create(c_parent, dentry, mode | S_IFDIR); } -static void cgroup_css_killed(struct cgroup *cgrp) -{ - if (!atomic_dec_and_test(&cgrp->css_kill_cnt)) - return; - - /* percpu ref's of all css's are killed, kick off the next step */ - INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn); - schedule_work(&cgrp->destroy_work); -} - /* * This is called when the refcnt of a css is confirmed to be killed. * css_tryget() is now guaranteed to fail. @@ -4565,7 +4556,17 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup *cgrp = css->cgroup; - cgroup_css_killed(cgrp); + mutex_lock(&cgroup_mutex); + + /* + * If @cgrp is marked dead, it's waiting for refs of all css's to + * be disabled before proceeding to the second phase of cgroup + * destruction. If we are the last one, kick it off. + */ + if (!--cgrp->nr_css && cgroup_is_dead(cgrp)) + cgroup_destroy_css_killed(cgrp); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -4634,11 +4635,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * Use percpu_ref_kill_and_confirm() to get notifications as each * css is confirmed to be seen as killed on all CPUs. The * notification callback keeps track of the number of css's to be - * killed and schedules cgroup_offline_fn() to perform the rest of - * destruction once the percpu refs of all css's are confirmed to - * be killed. + * killed and invokes cgroup_destroy_css_killed() to perform the + * rest of destruction once the percpu refs of all css's are + * confirmed to be killed. */ - atomic_set(&cgrp->css_kill_cnt, 1); for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); @@ -4648,10 +4648,8 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) */ percpu_ref_get(&css->refcnt); - atomic_inc(&cgrp->css_kill_cnt); percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); } - cgroup_css_killed(cgrp); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4668,6 +4666,15 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) list_del_init(&cgrp->release_list); raw_spin_unlock(&release_list_lock); + /* + * If @cgrp has css's attached, the second stage of cgroup + * destruction is kicked off from css_killed_work_fn() after the + * refs of all attached css's are killed. If @cgrp doesn't have + * any css, we kick it off here. + */ + if (!cgrp->nr_css) + cgroup_destroy_css_killed(cgrp); + /* * Clear and remove @cgrp directory. The removal puts the base ref * but we aren't quite done with @cgrp yet, so hold onto it. @@ -4693,7 +4700,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) }; /** - * cgroup_offline_fn - the second step of cgroup destruction + * cgroup_destroy_css_killed - the second step of cgroup destruction * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being @@ -4702,14 +4709,13 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * is the second step of destruction described in the comment above * cgroup_destroy_locked(). */ -static void cgroup_offline_fn(struct work_struct *work) +static void cgroup_destroy_css_killed(struct cgroup *cgrp) { - struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; struct cgroup_subsys *ss; - mutex_lock(&cgroup_mutex); + lockdep_assert_held(&cgroup_mutex); /* * css_tryget() is guaranteed to fail now. Tell subsystems to @@ -4743,8 +4749,6 @@ static void cgroup_offline_fn(struct work_struct *work) set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); - - mutex_unlock(&cgroup_mutex); } static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) -- cgit v1.3-8-gc7d7 From 09a503ea3a816b285b0b402b7f785eaec0c7a7e1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:50 -0400 Subject: cgroup: decouple cgroup_subsys_state destruction from cgroup destruction Currently, css (cgroup_subsys_state) lifetime is tied to that of the associated cgroup. css's are created when the associated cgroup is created and destroyed when it gets destroyed. Also, individual css's aren't RCU protected but the whole cgroup is. With the planned unified hierarchy, css's will need to be dynamically created and destroyed within the lifetime of a cgroup. To enable such usages, this patch decouples css destruction from cgroup destruction - offline_css() invocation and the final css_put() are moved from cgroup_destroy_css_killed() to css_killed_work_fn(). Now each css is individually offlined and put as its reference count is killed instead of waiting for all css's attached to the cgroup to finish refcnt killing and then proceeding to offlining and putting them together. While this changes the order of destruction operations, the changes shouldn't be noticeable to cgroup subsystems or userland. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 2 +- kernel/cgroup.c | 52 +++++++++++++++++++++++--------------------------- 2 files changed, 25 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 80dca872f4d4..71e77e7cdb6f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -82,7 +82,7 @@ struct cgroup_subsys_state { /* ID for this css, if possible */ struct css_id __rcu *id; - /* Used to put @cgroup->dentry on the last css_put() */ + /* percpu_ref killing and putting dentry on the last css_put() */ struct work_struct destroy_work; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 174f4c3d72ef..3c4c4b01ffe5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4355,6 +4355,7 @@ static void offline_css(struct cgroup_subsys_state *css) ss->css_offline(css); css->flags &= ~CSS_ONLINE; + css->cgroup->nr_css--; } /* @@ -4558,15 +4559,30 @@ static void css_killed_work_fn(struct work_struct *work) mutex_lock(&cgroup_mutex); + /* + * css_tryget() is guaranteed to fail now. Tell subsystems to + * initate destruction. + */ + offline_css(css); + /* * If @cgrp is marked dead, it's waiting for refs of all css's to * be disabled before proceeding to the second phase of cgroup * destruction. If we are the last one, kick it off. */ - if (!--cgrp->nr_css && cgroup_is_dead(cgrp)) + if (!cgrp->nr_css && cgroup_is_dead(cgrp)) cgroup_destroy_css_killed(cgrp); mutex_unlock(&cgroup_mutex); + + /* + * Put the css refs from kill_css(). Each css holds an extra + * reference to the cgroup's dentry and cgroup removal proceeds + * regardless of css refs. On the last put of each css, whenever + * that may be, the extra dentry ref is put so that dentry + * destruction happens only after all css's are released. + */ + css_put(css); } /* css kill confirmation processing requires process context, bounce */ @@ -4633,11 +4649,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * as killed on all CPUs on return. * * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. The - * notification callback keeps track of the number of css's to be - * killed and invokes cgroup_destroy_css_killed() to perform the - * rest of destruction once the percpu refs of all css's are - * confirmed to be killed. + * css is confirmed to be seen as killed on all CPUs. + * cgroup_destroy_css_killed() will be invoked to perform the rest + * of destruction once the percpu refs of all css's are confirmed + * to be killed. */ for_each_root_subsys(cgrp->root, ss) { struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); @@ -4704,36 +4719,17 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * @work: cgroup->destroy_free_work * * This function is invoked from a work item for a cgroup which is being - * destroyed after the percpu refcnts of all css's are guaranteed to be - * seen as killed on all CPUs, and performs the rest of destruction. This - * is the second step of destruction described in the comment above - * cgroup_destroy_locked(). + * destroyed after all css's are offlined and performs the rest of + * destruction. This is the second step of destruction described in the + * comment above cgroup_destroy_locked(). */ static void cgroup_destroy_css_killed(struct cgroup *cgrp) { struct cgroup *parent = cgrp->parent; struct dentry *d = cgrp->dentry; - struct cgroup_subsys *ss; lockdep_assert_held(&cgroup_mutex); - /* - * css_tryget() is guaranteed to fail now. Tell subsystems to - * initate destruction. - */ - for_each_root_subsys(cgrp->root, ss) - offline_css(cgroup_css(cgrp, ss->subsys_id)); - - /* - * Put the css refs from cgroup_destroy_locked(). Each css holds - * an extra reference to the cgroup's dentry and cgroup removal - * proceeds regardless of css refs. On the last put of each css, - * whenever that may be, the extra dentry ref is put so that dentry - * destruction happens only after all css's are released. - */ - for_each_root_subsys(cgrp->root, ss) - css_put(cgroup_css(cgrp, ss->subsys_id)); - /* delete this cgroup from parent->children */ list_del_rcu(&cgrp->sibling); -- cgit v1.3-8-gc7d7 From edae0c3358947f8be5ca99f762d89e0c38e1f5d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: factor out kill_css() Factor out css ref killing from cgroup_destroy_locked() into kill_css(). We're gonna add more to the path and the factored out function will eventually be called from other places too. While at it, replace open coded percpu_ref_get() with css_get() for consistency. This shouldn't cause any functional difference as the function is not used for root cgroups. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 58 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 35 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3c4c4b01ffe5..7b7575f3119c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4595,6 +4595,36 @@ static void css_killed_ref_fn(struct percpu_ref *ref) schedule_work(&css->destroy_work); } +/** + * kill_css - destroy a css + * @css: css to destroy + * + * This function initiates destruction of @css by putting its base + * reference. ->css_offline() will be invoked asynchronously once + * css_tryget() is guaranteed to fail and when the reference count reaches + * zero, @css will be released. + */ +static void kill_css(struct cgroup_subsys_state *css) +{ + /* + * Killing would put the base ref, but we need to keep it alive + * until after ->css_offline(). + */ + css_get(css); + + /* + * cgroup core guarantees that, by the time ->css_offline() is + * invoked, no new css reference will be given out via + * css_tryget(). We can't simply call percpu_ref_kill() and + * proceed to offlining css's because percpu_ref_kill() doesn't + * guarantee that the ref is seen as killed on all CPUs on return. + * + * Use percpu_ref_kill_and_confirm() to get notifications as each + * css is confirmed to be seen as killed on all CPUs. + */ + percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); +} + /** * cgroup_destroy_locked - the first stage of cgroup destruction * @cgrp: cgroup to be destroyed @@ -4641,30 +4671,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Block new css_tryget() by killing css refcnts. cgroup core - * guarantees that, by the time ->css_offline() is invoked, no new - * css reference will be given out via css_tryget(). We can't - * simply call percpu_ref_kill() and proceed to offlining css's - * because percpu_ref_kill() doesn't guarantee that the ref is seen - * as killed on all CPUs on return. - * - * Use percpu_ref_kill_and_confirm() to get notifications as each - * css is confirmed to be seen as killed on all CPUs. - * cgroup_destroy_css_killed() will be invoked to perform the rest - * of destruction once the percpu refs of all css's are confirmed - * to be killed. + * Initiate massacre of all css's. cgroup_destroy_css_killed() + * will be invoked to perform the rest of destruction once the + * percpu refs of all css's are confirmed to be killed. */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); - - /* - * Killing would put the base ref, but we need to keep it - * alive until after ->css_offline. - */ - percpu_ref_get(&css->refcnt); - - percpu_ref_kill_and_confirm(&css->refcnt, css_killed_ref_fn); - } + for_each_root_subsys(cgrp->root, ss) + kill_css(cgroup_css(cgrp, ss->subsys_id)); /* * Mark @cgrp dead. This prevents further task migration and child -- cgit v1.3-8-gc7d7 From 3c14f8b44fafaa60519440bea1591e495b928327 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: move subsys file removal to kill_css() With the planned unified hierarchy, individual css's will be created and destroyed dynamically across the lifetime of a cgroup. To enable such usages, css destruction is being decoupled from cgroup destruction. This patch moves subsys file removal from cgroup_destroy_locked() to kill_css(). While this changes the order of destruction operations, the changes shouldn't be noticeable to cgroup subsystems or userland. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7b7575f3119c..3137e38995b0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4599,13 +4599,15 @@ static void css_killed_ref_fn(struct percpu_ref *ref) * kill_css - destroy a css * @css: css to destroy * - * This function initiates destruction of @css by putting its base - * reference. ->css_offline() will be invoked asynchronously once - * css_tryget() is guaranteed to fail and when the reference count reaches - * zero, @css will be released. + * This function initiates destruction of @css by removing cgroup interface + * files and putting its base reference. ->css_offline() will be invoked + * asynchronously once css_tryget() is guaranteed to fail and when the + * reference count reaches zero, @css will be released. */ static void kill_css(struct cgroup_subsys_state *css) { + cgroup_clear_dir(css->cgroup, 1 << css->ss->subsys_id); + /* * Killing would put the base ref, but we need to keep it alive * until after ->css_offline(). @@ -4703,10 +4705,10 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) cgroup_destroy_css_killed(cgrp); /* - * Clear and remove @cgrp directory. The removal puts the base ref - * but we aren't quite done with @cgrp yet, so hold onto it. + * Clear the base files and remove @cgrp directory. The removal + * puts the base ref but we aren't quite done with @cgrp yet, so + * hold onto it. */ - cgroup_clear_dir(cgrp, cgrp->root->subsys_mask); cgroup_addrm_files(cgrp, cgroup_base_files, false); dget(d); cgroup_d_remove_dir(d); -- cgit v1.3-8-gc7d7 From 0c21ead136a900c36f1ab74fd7d09a306dc31324 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 13 Aug 2013 20:22:51 -0400 Subject: cgroup: RCU protect each cgroup_subsys_state release With the planned unified hierarchy, individual css's will be created and destroyed dynamically across the lifetime of a cgroup. To enable such usages, css destruction is being decoupled from cgroup destruction. Most of the destruction path has been decoupled but the actual free of css still depends on cgroup free path. When all css refs are drained, css_release() kicks off css_free_work_fn() which puts the cgroup. When the cgroup refcnt reaches zero, cgroup_diput() is invoked which in turn schedules RCU free of the cgroup. After a grace period, all css's are freed along with the cgroup itself. This patch moves the RCU grace period and css freeing from cgroup release path to css release path. css_release(), instead of kicking off css_free_work_fn() directly, schedules RCU callback css_free_rcu_fn() which in turn kicks off css_free_work_fn() after a RCU grace period. css_free_work_fn() is updated to free the css directly. The five-way punting - percpu ref kill confirmation, a work item, percpu ref release, RCU grace period, and again a work item - is quite hairy but the work items are there only to provide process context and the actual sequence is kill confirm -> release -> RCU free, which isn't simple but not too crazy. This removes cgroup_css() usage after offline_css() allowing clearing cgroup->subsys[] from offline_css(), which makes it consistent with online_css() and brings it closer to proper lifetime management for individual css's. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- include/linux/cgroup.h | 3 ++- kernel/cgroup.c | 53 +++++++++++++++++++++++++++++++++++--------------- 2 files changed, 39 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 71e77e7cdb6f..c24bd0b9f93a 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -82,7 +82,8 @@ struct cgroup_subsys_state { /* ID for this css, if possible */ struct css_id __rcu *id; - /* percpu_ref killing and putting dentry on the last css_put() */ + /* percpu_ref killing and RCU release */ + struct rcu_head rcu_head; struct work_struct destroy_work; }; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3137e38995b0..66d01078eebe 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -869,18 +869,8 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry) static void cgroup_free_fn(struct work_struct *work) { struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work); - struct cgroup_subsys *ss; mutex_lock(&cgroup_mutex); - /* - * Release the subsystem state objects. - */ - for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); - - ss->css_free(css); - } - cgrp->root->number_of_cgroups--; mutex_unlock(&cgroup_mutex); @@ -4281,32 +4271,62 @@ err: return ret; } +/* + * css destruction is four-stage process. + * + * 1. Destruction starts. Killing of the percpu_ref is initiated. + * Implemented in kill_css(). + * + * 2. When the percpu_ref is confirmed to be visible as killed on all CPUs + * and thus css_tryget() is guaranteed to fail, the css can be offlined + * by invoking offline_css(). After offlining, the base ref is put. + * Implemented in css_killed_work_fn(). + * + * 3. When the percpu_ref reaches zero, the only possible remaining + * accessors are inside RCU read sections. css_release() schedules the + * RCU callback. + * + * 4. After the grace period, the css can be freed. Implemented in + * css_free_work_fn(). + * + * It is actually hairier because both step 2 and 4 require process context + * and thus involve punting to css->destroy_work adding two additional + * steps to the already complex sequence. + */ static void css_free_work_fn(struct work_struct *work) { struct cgroup_subsys_state *css = container_of(work, struct cgroup_subsys_state, destroy_work); + struct cgroup *cgrp = css->cgroup; if (css->parent) css_put(css->parent); - cgroup_dput(css->cgroup); + css->ss->css_free(css); + cgroup_dput(cgrp); } -static void css_release(struct percpu_ref *ref) +static void css_free_rcu_fn(struct rcu_head *rcu_head) { struct cgroup_subsys_state *css = - container_of(ref, struct cgroup_subsys_state, refcnt); + container_of(rcu_head, struct cgroup_subsys_state, rcu_head); /* * css holds an extra ref to @cgrp->dentry which is put on the last - * css_put(). dput() requires process context, which css_put() may - * be called without. @css->destroy_work will be used to invoke - * dput() asynchronously from css_put(). + * css_put(). dput() requires process context which we don't have. */ INIT_WORK(&css->destroy_work, css_free_work_fn); schedule_work(&css->destroy_work); } +static void css_release(struct percpu_ref *ref) +{ + struct cgroup_subsys_state *css = + container_of(ref, struct cgroup_subsys_state, refcnt); + + call_rcu(&css->rcu_head, css_free_rcu_fn); +} + static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, struct cgroup *cgrp) { @@ -4356,6 +4376,7 @@ static void offline_css(struct cgroup_subsys_state *css) css->flags &= ~CSS_ONLINE; css->cgroup->nr_css--; + RCU_INIT_POINTER(css->cgroup->subsys[ss->subsys_id], css); } /* -- cgit v1.3-8-gc7d7 From ff58ac0d58d51bffe868b239ed8fce7c4a23c5a9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 13 Aug 2013 09:17:33 +0800 Subject: cpuset: remove an unncessary forward declaration Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cpuset.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 72a0383f382f..95f4b25e1538 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -68,9 +68,6 @@ */ int number_of_cpusets __read_mostly; -/* Forward declare cgroup structures */ -struct cgroup_subsys cpuset_subsys; - /* See "Frequency meter" comments, below. */ struct fmeter { -- cgit v1.3-8-gc7d7 From 65f382fd0c8fa483713c0971de9f1dfb4cf1ad9c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Jul 2013 19:12:32 +0200 Subject: context_tracking: Ground setup for static key use Prepare for using a static key in the context tracking subsystem. This will help optimizing the off case on its many users: * user_enter, user_exit, exception_enter, exception_exit, guest_enter, guest_exit, vtime_*() Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/context_tracking.h | 10 ++++++++++ init/main.c | 2 ++ kernel/context_tracking.c | 23 +++++++++++++++++------ 3 files changed, 29 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 1ae37c708c63..c138c24bad1a 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -4,6 +4,7 @@ #include #include #include +#include #include struct context_tracking { @@ -22,6 +23,7 @@ struct context_tracking { #ifdef CONFIG_CONTEXT_TRACKING +extern struct static_key context_tracking_enabled; DECLARE_PER_CPU(struct context_tracking, context_tracking); static inline bool context_tracking_in_user(void) @@ -67,6 +69,14 @@ static inline void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { } #endif /* !CONFIG_CONTEXT_TRACKING */ + +#ifdef CONFIG_CONTEXT_TRACKING_FORCE +extern void context_tracking_init(void); +#else +static inline void context_tracking_init(void) { } +#endif /* CONFIG_CONTEXT_TRACKING_FORCE */ + + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void guest_enter(void); extern void guest_exit(void); diff --git a/init/main.c b/init/main.c index d03d2ec2eacf..af310afbef28 100644 --- a/init/main.c +++ b/init/main.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include @@ -545,6 +546,7 @@ asmlinkage void __init start_kernel(void) idr_init_cache(); rcu_init(); tick_nohz_init(); + context_tracking_init(); radix_tree_init(); /* init some links before init_ISA_irqs() */ early_irq_init(); diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 72bcb2570d3e..839d377d0da5 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -20,15 +20,16 @@ #include #include -DEFINE_PER_CPU(struct context_tracking, context_tracking) = { -#ifdef CONFIG_CONTEXT_TRACKING_FORCE - .active = true, -#endif -}; +struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; + +DEFINE_PER_CPU(struct context_tracking, context_tracking); void context_tracking_cpu_set(int cpu) { - per_cpu(context_tracking.active, cpu) = true; + if (!per_cpu(context_tracking.active, cpu)) { + per_cpu(context_tracking.active, cpu) = true; + static_key_slow_inc(&context_tracking_enabled); + } } /** @@ -202,3 +203,13 @@ void context_tracking_task_switch(struct task_struct *prev, clear_tsk_thread_flag(prev, TIF_NOHZ); set_tsk_thread_flag(next, TIF_NOHZ); } + +#ifdef CONFIG_CONTEXT_TRACKING_FORCE +void __init context_tracking_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + context_tracking_cpu_set(cpu); +} +#endif -- cgit v1.3-8-gc7d7 From ad65782fba507d91a0a98f519b59e79cac1b474c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Jul 2013 02:44:35 +0200 Subject: context_tracking: Optimize main APIs off case with static key Optimize user and exception entry/exit APIs with static keys. This minimize the overhead for those who enable CONFIG_NO_HZ_FULL without always using it. Having no range passed to nohz_full= should result in the probes to be nopped (at least we hope so...). If this proves not be enough in the long term, we'll need to bring an exception slow path by re-routing the exception handlers. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/context_tracking.h | 27 ++++++++++++++++++++++----- kernel/context_tracking.c | 12 ++++++------ 2 files changed, 28 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index c138c24bad1a..38ab60b3f3a6 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -38,23 +38,40 @@ static inline bool context_tracking_active(void) extern void context_tracking_cpu_set(int cpu); -extern void user_enter(void); -extern void user_exit(void); +extern void context_tracking_user_enter(void); +extern void context_tracking_user_exit(void); + +static inline void user_enter(void) +{ + if (static_key_false(&context_tracking_enabled)) + context_tracking_user_enter(); + +} +static inline void user_exit(void) +{ + if (static_key_false(&context_tracking_enabled)) + context_tracking_user_exit(); +} static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; + if (!static_key_false(&context_tracking_enabled)) + return 0; + prev_ctx = this_cpu_read(context_tracking.state); - user_exit(); + context_tracking_user_exit(); return prev_ctx; } static inline void exception_exit(enum ctx_state prev_ctx) { - if (prev_ctx == IN_USER) - user_enter(); + if (static_key_false(&context_tracking_enabled)) { + if (prev_ctx == IN_USER) + context_tracking_user_enter(); + } } extern void context_tracking_task_switch(struct task_struct *prev, diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 839d377d0da5..6e89e094c80e 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -33,15 +33,15 @@ void context_tracking_cpu_set(int cpu) } /** - * user_enter - Inform the context tracking that the CPU is going to - * enter userspace mode. + * context_tracking_user_enter - Inform the context tracking that the CPU is going to + * enter userspace mode. * * This function must be called right before we switch from the kernel * to userspace, when it's guaranteed the remaining kernel instructions * to execute won't use any RCU read side critical section because this * function sets RCU in extended quiescent state. */ -void user_enter(void) +void context_tracking_user_enter(void) { unsigned long flags; @@ -131,8 +131,8 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); #endif /* CONFIG_PREEMPT */ /** - * user_exit - Inform the context tracking that the CPU is - * exiting userspace mode and entering the kernel. + * context_tracking_user_exit - Inform the context tracking that the CPU is + * exiting userspace mode and entering the kernel. * * This function must be called after we entered the kernel from userspace * before any use of RCU read side critical section. This potentially include @@ -141,7 +141,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_context); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void user_exit(void) +void context_tracking_user_exit(void) { unsigned long flags; -- cgit v1.3-8-gc7d7 From 48d6a816a8bf36e2a197c322697323003bdc1cfe Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 10 Jul 2013 02:44:35 +0200 Subject: context_tracking: Optimize guest APIs off case with static key Optimize guest entry/exit APIs with static keys. This minimize the overhead for those who enable CONFIG_NO_HZ_FULL without always using it. Having no range passed to nohz_full= should result in the probes overhead to be minimized. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/context_tracking.h | 19 +++++++++++++++++-- kernel/context_tracking.c | 23 ++--------------------- kernel/sched/cputime.c | 2 ++ 3 files changed, 21 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 38ab60b3f3a6..8854eadb2142 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -95,8 +95,23 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -extern void guest_enter(void); -extern void guest_exit(void); +static inline void guest_enter(void) +{ + if (static_key_false(&context_tracking_enabled) && + vtime_accounting_enabled()) + vtime_guest_enter(current); + else + current->flags |= PF_VCPU; +} + +static inline void guest_exit(void) +{ + if (static_key_false(&context_tracking_enabled) && + vtime_accounting_enabled()) + vtime_guest_exit(current); + else + current->flags &= ~PF_VCPU; +} #else static inline void guest_enter(void) { diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 6e89e094c80e..b6a186c4b886 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -21,8 +21,10 @@ #include struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); +EXPORT_SYMBOL_GPL(context_tracking); void context_tracking_cpu_set(int cpu) { @@ -163,27 +165,6 @@ void context_tracking_user_exit(void) local_irq_restore(flags); } -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -void guest_enter(void) -{ - if (vtime_accounting_enabled()) - vtime_guest_enter(current); - else - current->flags |= PF_VCPU; -} -EXPORT_SYMBOL_GPL(guest_enter); - -void guest_exit(void) -{ - if (vtime_accounting_enabled()) - vtime_guest_exit(current); - else - current->flags &= ~PF_VCPU; -} -EXPORT_SYMBOL_GPL(guest_exit); -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ - - /** * context_tracking_task_switch - context switch the syscall callbacks * @prev: the task that is being switched out diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 223a35efa0a6..bb6b29a3067c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -724,6 +724,7 @@ void vtime_guest_enter(struct task_struct *tsk) current->flags |= PF_VCPU; write_sequnlock(&tsk->vtime_seqlock); } +EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { @@ -732,6 +733,7 @@ void vtime_guest_exit(struct task_struct *tsk) current->flags &= ~PF_VCPU; write_sequnlock(&tsk->vtime_seqlock); } +EXPORT_SYMBOL_GPL(vtime_guest_exit); void vtime_account_idle(struct task_struct *tsk) { -- cgit v1.3-8-gc7d7 From 73d424f9af7b571276e6284617cb59726d47bf12 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Jul 2013 19:42:13 +0200 Subject: context_tracking: Optimize context switch off case with static keys No need for syscall slowpath if no CPU is full dynticks, rather nop this in this case. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/context_tracking.h | 11 +++++++++-- kernel/context_tracking.c | 6 +++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 8854eadb2142..e070ea5dadac 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -40,6 +40,8 @@ extern void context_tracking_cpu_set(int cpu); extern void context_tracking_user_enter(void); extern void context_tracking_user_exit(void); +extern void __context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next); static inline void user_enter(void) { @@ -74,8 +76,12 @@ static inline void exception_exit(enum ctx_state prev_ctx) } } -extern void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next); +static inline void context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) +{ + if (static_key_false(&context_tracking_enabled)) + __context_tracking_task_switch(prev, next); +} #else static inline bool context_tracking_in_user(void) { return false; } static inline void user_enter(void) { } @@ -112,6 +118,7 @@ static inline void guest_exit(void) else current->flags &= ~PF_VCPU; } + #else static inline void guest_enter(void) { diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index b6a186c4b886..c17822673c39 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -166,7 +166,7 @@ void context_tracking_user_exit(void) } /** - * context_tracking_task_switch - context switch the syscall callbacks + * __context_tracking_task_switch - context switch the syscall callbacks * @prev: the task that is being switched out * @next: the task that is being switched in * @@ -178,8 +178,8 @@ void context_tracking_user_exit(void) * migrate to some CPU that doesn't do the context tracking. As such the TIF * flag may not be desired there. */ -void context_tracking_task_switch(struct task_struct *prev, - struct task_struct *next) +void __context_tracking_task_switch(struct task_struct *prev, + struct task_struct *next) { clear_tsk_thread_flag(prev, TIF_NOHZ); set_tsk_thread_flag(next, TIF_NOHZ); -- cgit v1.3-8-gc7d7 From 1b6a259aa5ab16d8b215bfc19ff7c9ffa8858f10 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 11 Jul 2013 20:27:43 +0200 Subject: context_tracking: User/kernel broundary cross trace events This can be useful to track all kernel/user round trips. And it's also helpful to debug the context tracking subsystem. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/trace/events/context_tracking.h | 58 +++++++++++++++++++++++++++++++++ kernel/context_tracking.c | 5 +++ 2 files changed, 63 insertions(+) create mode 100644 include/trace/events/context_tracking.h (limited to 'kernel') diff --git a/include/trace/events/context_tracking.h b/include/trace/events/context_tracking.h new file mode 100644 index 000000000000..ce8007cf29cf --- /dev/null +++ b/include/trace/events/context_tracking.h @@ -0,0 +1,58 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM context_tracking + +#if !defined(_TRACE_CONTEXT_TRACKING_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_CONTEXT_TRACKING_H + +#include + +DECLARE_EVENT_CLASS(context_tracking_user, + + TP_PROTO(int dummy), + + TP_ARGS(dummy), + + TP_STRUCT__entry( + __field( int, dummy ) + ), + + TP_fast_assign( + __entry->dummy = dummy; + ), + + TP_printk("%s", "") +); + +/** + * user_enter - called when the kernel resumes to userspace + * @dummy: dummy arg to make trace event macro happy + * + * This event occurs when the kernel resumes to userspace after + * an exception or a syscall. + */ +DEFINE_EVENT(context_tracking_user, user_enter, + + TP_PROTO(int dummy), + + TP_ARGS(dummy) +); + +/** + * user_exit - called when userspace enters the kernel + * @dummy: dummy arg to make trace event macro happy + * + * This event occurs when userspace enters the kernel through + * an exception or a syscall. + */ +DEFINE_EVENT(context_tracking_user, user_exit, + + TP_PROTO(int dummy), + + TP_ARGS(dummy) +); + + +#endif /* _TRACE_CONTEXT_TRACKING_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index c17822673c39..247091bf0587 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -20,6 +20,9 @@ #include #include +#define CREATE_TRACE_POINTS +#include + struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; EXPORT_SYMBOL_GPL(context_tracking_enabled); @@ -64,6 +67,7 @@ void context_tracking_user_enter(void) local_irq_save(flags); if ( __this_cpu_read(context_tracking.state) != IN_USER) { if (__this_cpu_read(context_tracking.active)) { + trace_user_enter(0); /* * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be @@ -159,6 +163,7 @@ void context_tracking_user_exit(void) */ rcu_user_exit(); vtime_user_exit(current); + trace_user_exit(0); } __this_cpu_write(context_tracking.state, IN_KERNEL); } -- cgit v1.3-8-gc7d7 From 7621d1f8bcb418e7a7ac583e89e38ec01b7ed182 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Jul 2013 17:07:35 +0200 Subject: vtime: Remove a few unneeded generic vtime state checks Some generic vtime APIs check if the vtime accounting is enabled on the local CPU before doing their work. Some of these are not needed because all their callers already take care of that. Let's remove the checks on these. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bb6b29a3067c..5f273b477764 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -664,9 +664,6 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); __vtime_account_system(tsk); write_sequnlock(&tsk->vtime_seqlock); @@ -686,12 +683,7 @@ void vtime_account_irq_exit(struct task_struct *tsk) void vtime_account_user(struct task_struct *tsk) { - cputime_t delta_cpu; - - if (!vtime_accounting_enabled()) - return; - - delta_cpu = get_vtime_delta(tsk); + cputime_t delta_cpu = get_vtime_delta(tsk); write_seqlock(&tsk->vtime_seqlock); tsk->vtime_snap_whence = VTIME_SYS; @@ -701,9 +693,6 @@ void vtime_account_user(struct task_struct *tsk) void vtime_user_enter(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); tsk->vtime_snap_whence = VTIME_USER; __vtime_account_system(tsk); -- cgit v1.3-8-gc7d7 From 54461562c90e0ac104764c5a9de637fd9151a1c1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Jul 2013 17:10:18 +0200 Subject: vtime: Fix racy cputime delta update get_vtime_delta() must be called under the task vtime_seqlock with the code that does the cputime accounting flush. Otherwise the cputime reader can be fooled and run into a race where it sees the snapshot update but misses the cputime flush. As a result it can report a cputime that is way too short. Fix vtime_account_user() that wasn't complying to that rule. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5f273b477764..b62d5c027c7e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -683,9 +683,10 @@ void vtime_account_irq_exit(struct task_struct *tsk) void vtime_account_user(struct task_struct *tsk) { - cputime_t delta_cpu = get_vtime_delta(tsk); + cputime_t delta_cpu; write_seqlock(&tsk->vtime_seqlock); + delta_cpu = get_vtime_delta(tsk); tsk->vtime_snap_whence = VTIME_SYS; account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); write_sequnlock(&tsk->vtime_seqlock); -- cgit v1.3-8-gc7d7 From b04934061330a4a449cfce703c97d887c3e11cd7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 12 Jul 2013 03:10:15 +0200 Subject: vtime: Optimize full dynticks accounting off case with static keys If no CPU is in the full dynticks range, we can avoid the full dynticks cputime accounting through generic vtime along with its overhead and use the traditional tick based accounting instead. Let's do this and nope the off case with static keys. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/context_tracking.h | 6 ++-- include/linux/vtime.h | 70 ++++++++++++++++++++++++++++++++++------ kernel/sched/cputime.c | 22 +++---------- 3 files changed, 67 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 82ec4870e064..158158704c30 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -74,8 +74,7 @@ static inline void context_tracking_init(void) { } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static inline void guest_enter(void) { - if (static_key_false(&context_tracking_enabled) && - vtime_accounting_enabled()) + if (vtime_accounting_enabled()) vtime_guest_enter(current); else current->flags |= PF_VCPU; @@ -83,8 +82,7 @@ static inline void guest_enter(void) static inline void guest_exit(void) { - if (static_key_false(&context_tracking_enabled) && - vtime_accounting_enabled()) + if (vtime_accounting_enabled()) vtime_guest_exit(current); else current->flags &= ~PF_VCPU; diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 2ad073915e8c..f5b72b364bda 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -1,22 +1,68 @@ #ifndef _LINUX_KERNEL_VTIME_H #define _LINUX_KERNEL_VTIME_H +#include #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE #include #endif + struct task_struct; +/* + * vtime_accounting_enabled() definitions/declarations + */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE +static inline bool vtime_accounting_enabled(void) { return true; } +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +static inline bool vtime_accounting_enabled(void) +{ + if (static_key_false(&context_tracking_enabled)) { + if (context_tracking_active()) + return true; + } + + return false; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ + +#ifndef CONFIG_VIRT_CPU_ACCOUNTING +static inline bool vtime_accounting_enabled(void) { return false; } +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ + + +/* + * Common vtime APIs + */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifdef __ARCH_HAS_VTIME_TASK_SWITCH extern void vtime_task_switch(struct task_struct *prev); +#else +extern void vtime_common_task_switch(struct task_struct *prev); +static inline void vtime_task_switch(struct task_struct *prev) +{ + if (vtime_accounting_enabled()) + vtime_common_task_switch(prev); +} +#endif /* __ARCH_HAS_VTIME_TASK_SWITCH */ + extern void vtime_account_system(struct task_struct *tsk); extern void vtime_account_idle(struct task_struct *tsk); extern void vtime_account_user(struct task_struct *tsk); -extern void vtime_account_irq_enter(struct task_struct *tsk); -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -static inline bool vtime_accounting_enabled(void) { return true; } -#endif +#ifdef __ARCH_HAS_VTIME_ACCOUNT +extern void vtime_account_irq_enter(struct task_struct *tsk); +#else +extern void vtime_common_account_irq_enter(struct task_struct *tsk); +static inline void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (vtime_accounting_enabled()) + vtime_common_account_irq_enter(tsk); +} +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ #else /* !CONFIG_VIRT_CPU_ACCOUNTING */ @@ -24,14 +70,20 @@ static inline void vtime_task_switch(struct task_struct *prev) { } static inline void vtime_account_system(struct task_struct *tsk) { } static inline void vtime_account_user(struct task_struct *tsk) { } static inline void vtime_account_irq_enter(struct task_struct *tsk) { } -static inline bool vtime_accounting_enabled(void) { return false; } -#endif +#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern void arch_vtime_task_switch(struct task_struct *tsk); -extern void vtime_account_irq_exit(struct task_struct *tsk); -extern bool vtime_accounting_enabled(void); +extern void vtime_gen_account_irq_exit(struct task_struct *tsk); + +static inline void vtime_account_irq_exit(struct task_struct *tsk) +{ + if (vtime_accounting_enabled()) + vtime_gen_account_irq_exit(tsk); +} + extern void vtime_user_enter(struct task_struct *tsk); + static inline void vtime_user_exit(struct task_struct *tsk) { vtime_account_user(tsk); @@ -39,7 +91,7 @@ static inline void vtime_user_exit(struct task_struct *tsk) extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); extern void vtime_init_idle(struct task_struct *tsk, int cpu); -#else +#else /* !CONFIG_VIRT_CPU_ACCOUNTING_GEN */ static inline void vtime_account_irq_exit(struct task_struct *tsk) { /* On hard|softirq exit we always account to hard|softirq cputime */ diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index b62d5c027c7e..0831b06aab97 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -378,11 +378,8 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ #ifdef CONFIG_VIRT_CPU_ACCOUNTING #ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) +void vtime_common_task_switch(struct task_struct *prev) { - if (!vtime_accounting_enabled()) - return; - if (is_idle_task(prev)) vtime_account_idle(prev); else @@ -404,11 +401,8 @@ void vtime_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_common_account_irq_enter(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - if (!in_interrupt()) { /* * If we interrupted user, context_tracking_in_user() @@ -428,7 +422,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) } vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ @@ -669,11 +663,8 @@ void vtime_account_system(struct task_struct *tsk) write_sequnlock(&tsk->vtime_seqlock); } -void vtime_account_irq_exit(struct task_struct *tsk) +void vtime_gen_account_irq_exit(struct task_struct *tsk) { - if (!vtime_accounting_enabled()) - return; - write_seqlock(&tsk->vtime_seqlock); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; @@ -732,11 +723,6 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(delta_cpu); } -bool vtime_accounting_enabled(void) -{ - return context_tracking_active(); -} - void arch_vtime_task_switch(struct task_struct *prev) { write_seqlock(&prev->vtime_seqlock); -- cgit v1.3-8-gc7d7 From b854fafa4e06c50a92e00b39d75ee62083d986d6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 13 Jul 2013 17:24:20 +0200 Subject: vtime: Always scale generic vtime accounting results The cputime accounting in full dynticks can be a subtle mixup of CPUs using tick based accounting and others using generic vtime. As long as the tick can have a share on producing these stats, we want to scale the result against CFS precise accounting as the tick can miss some task hiding between the periodic interrupt. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 0831b06aab97..e9e742ed7280 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -553,12 +553,6 @@ static void cputime_adjust(struct task_cputime *curr, { cputime_t rtime, stime, utime, total; - if (vtime_accounting_enabled()) { - *ut = curr->utime; - *st = curr->stime; - return; - } - stime = curr->stime; total = stime + curr->utime; -- cgit v1.3-8-gc7d7 From af2350bd12096dfd04e1090b90bfecea1f75f84e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 15 Jul 2013 16:35:55 +0200 Subject: vtime: Always debug check snapshot source _before_ updating it The vtime delta update performed by get_vtime_delta() always check that the source of the snapshot is valid. Meanhile the snapshot updaters that rely on get_vtime_delta() also set the new snapshot origin. But some of them do this right before the call to get_vtime_delta(), making its debug check useless. This is easily fixable by moving the snapshot origin update after the call to get_vtime_delta(). The order doesn't matter there. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/sched/cputime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index e9e742ed7280..c1d7493825ae 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -660,9 +660,9 @@ void vtime_account_system(struct task_struct *tsk) void vtime_gen_account_irq_exit(struct task_struct *tsk) { write_seqlock(&tsk->vtime_seqlock); + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - __vtime_account_system(tsk); write_sequnlock(&tsk->vtime_seqlock); } @@ -680,8 +680,8 @@ void vtime_account_user(struct task_struct *tsk) void vtime_user_enter(struct task_struct *tsk) { write_seqlock(&tsk->vtime_seqlock); - tsk->vtime_snap_whence = VTIME_USER; __vtime_account_system(tsk); + tsk->vtime_snap_whence = VTIME_USER; write_sequnlock(&tsk->vtime_seqlock); } -- cgit v1.3-8-gc7d7 From 73867dcd0792ad14fb31bfe73d09d9a4576f7fc2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Jul 2013 23:31:00 +0200 Subject: nohz: Rename a few state variables Rename the full dynticks's cpumask and cpumask state variables to some more exportable names. These will be used later from global headers to optimize the main full dynticks APIs in conjunction with static keys. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- kernel/time/tick-sched.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 91a2528b5f44..b28dee43e644 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -149,8 +149,8 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) } #ifdef CONFIG_NO_HZ_FULL -static cpumask_var_t nohz_full_mask; -bool have_nohz_full_mask; +static cpumask_var_t tick_nohz_full_mask; +bool tick_nohz_full_running; static bool can_stop_full_tick(void) { @@ -183,7 +183,7 @@ static bool can_stop_full_tick(void) * Don't allow the user to think they can get * full NO_HZ with this machine. */ - WARN_ONCE(have_nohz_full_mask, + WARN_ONCE(tick_nohz_full_running, "NO_HZ FULL will not work with unstable sched clock"); return false; } @@ -240,11 +240,11 @@ static void nohz_full_kick_ipi(void *info) */ void tick_nohz_full_kick_all(void) { - if (!have_nohz_full_mask) + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(nohz_full_mask, + smp_call_function_many(tick_nohz_full_mask, nohz_full_kick_ipi, NULL, false); preempt_enable(); } @@ -272,10 +272,10 @@ out: int tick_nohz_full_cpu(int cpu) { - if (!have_nohz_full_mask) + if (!tick_nohz_full_running) return 0; - return cpumask_test_cpu(cpu, nohz_full_mask); + return cpumask_test_cpu(cpu, tick_nohz_full_mask); } /* Parse the boot-time nohz CPU list from the kernel parameters. */ @@ -283,18 +283,18 @@ static int __init tick_nohz_full_setup(char *str) { int cpu; - alloc_bootmem_cpumask_var(&nohz_full_mask); - if (cpulist_parse(str, nohz_full_mask) < 0) { + alloc_bootmem_cpumask_var(&tick_nohz_full_mask); + if (cpulist_parse(str, tick_nohz_full_mask) < 0) { pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); return 1; } cpu = smp_processor_id(); - if (cpumask_test_cpu(cpu, nohz_full_mask)) { + if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); - cpumask_clear_cpu(cpu, nohz_full_mask); + cpumask_clear_cpu(cpu, tick_nohz_full_mask); } - have_nohz_full_mask = true; + tick_nohz_full_running = true; return 1; } @@ -312,7 +312,7 @@ static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, * If we handle the timekeeping duty for full dynticks CPUs, * we can't safely shutdown that CPU. */ - if (have_nohz_full_mask && tick_do_timer_cpu == cpu) + if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return NOTIFY_BAD; break; } @@ -331,14 +331,14 @@ static int tick_nohz_init_all(void) int err = -1; #ifdef CONFIG_NO_HZ_FULL_ALL - if (!alloc_cpumask_var(&nohz_full_mask, GFP_KERNEL)) { + if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) { pr_err("NO_HZ: Can't allocate full dynticks cpumask\n"); return err; } err = 0; - cpumask_setall(nohz_full_mask); - cpumask_clear_cpu(smp_processor_id(), nohz_full_mask); - have_nohz_full_mask = true; + cpumask_setall(tick_nohz_full_mask); + cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask); + tick_nohz_full_running = true; #endif return err; } @@ -347,20 +347,20 @@ void __init tick_nohz_init(void) { int cpu; - if (!have_nohz_full_mask) { + if (!tick_nohz_full_running) { if (tick_nohz_init_all() < 0) return; } - for_each_cpu(cpu, nohz_full_mask) + for_each_cpu(cpu, tick_nohz_full_mask) context_tracking_cpu_set(cpu); cpu_notifier(tick_nohz_cpu_down_callback, 0); - cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), nohz_full_mask); + cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); } #else -#define have_nohz_full_mask (0) +#define tick_nohz_full_running (0) #endif /* @@ -738,7 +738,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (have_nohz_full_mask) { + if (tick_nohz_full_running) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around -- cgit v1.3-8-gc7d7 From 460775df4680b4593d8449bc171008578625a850 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Jul 2013 23:52:27 +0200 Subject: nohz: Optimize full dynticks state checks with static keys These APIs are frequenctly accessed and priority is given to optimize the full dynticks off-case in order to let distros enable this feature without suffering from significant performance regressions. Let's inline these APIs and optimize them with static keys. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/tick.h | 25 +++++++++++++++++++++++-- kernel/time/tick-sched.c | 14 ++------------ 2 files changed, 25 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 9180f4b85e6d..c60b079e1b37 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -10,6 +10,8 @@ #include #include #include +#include +#include #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -158,15 +160,34 @@ static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } # endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL +extern bool tick_nohz_full_running; +extern cpumask_var_t tick_nohz_full_mask; + +static inline bool tick_nohz_full_enabled(void) +{ + if (!static_key_false(&context_tracking_enabled)) + return false; + + return tick_nohz_full_running; +} + +static inline bool tick_nohz_full_cpu(int cpu) +{ + if (!tick_nohz_full_enabled()) + return false; + + return cpumask_test_cpu(cpu, tick_nohz_full_mask); +} + extern void tick_nohz_init(void); -extern int tick_nohz_full_cpu(int cpu); extern void tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_all(void); extern void tick_nohz_task_switch(struct task_struct *tsk); #else static inline void tick_nohz_init(void) { } -static inline int tick_nohz_full_cpu(int cpu) { return 0; } +static inline bool tick_nohz_full_enabled(void) { return false; } +static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b28dee43e644..0b7887389bd2 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -149,7 +149,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) } #ifdef CONFIG_NO_HZ_FULL -static cpumask_var_t tick_nohz_full_mask; +cpumask_var_t tick_nohz_full_mask; bool tick_nohz_full_running; static bool can_stop_full_tick(void) @@ -270,14 +270,6 @@ out: local_irq_restore(flags); } -int tick_nohz_full_cpu(int cpu) -{ - if (!tick_nohz_full_running) - return 0; - - return cpumask_test_cpu(cpu, tick_nohz_full_mask); -} - /* Parse the boot-time nohz CPU list from the kernel parameters. */ static int __init tick_nohz_full_setup(char *str) { @@ -359,8 +351,6 @@ void __init tick_nohz_init(void) cpulist_scnprintf(nohz_full_buf, sizeof(nohz_full_buf), tick_nohz_full_mask); pr_info("NO_HZ: Full dynticks CPUs: %s.\n", nohz_full_buf); } -#else -#define tick_nohz_full_running (0) #endif /* @@ -738,7 +728,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) return false; } - if (tick_nohz_full_running) { + if (tick_nohz_full_enabled()) { /* * Keep the tick alive to guarantee timekeeping progression * if there are full dynticks CPUs around -- cgit v1.3-8-gc7d7 From d13508f9440e46dccac6a2dd48d51a73b2207482 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 24 Jul 2013 23:52:27 +0200 Subject: nohz: Optimize full dynticks's sched hooks with static keys Scheduler IPIs and task context switches are serious fast path. Let's try to hide as much as we can the impact of full dynticks APIs' off case that are called on these sites through the use of static keys. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman --- include/linux/tick.h | 20 ++++++++++++++++---- kernel/time/tick-sched.c | 8 ++++---- 2 files changed, 20 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index c60b079e1b37..a7ef1d6fceb6 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -180,20 +180,32 @@ static inline bool tick_nohz_full_cpu(int cpu) } extern void tick_nohz_init(void); -extern void tick_nohz_full_check(void); +extern void __tick_nohz_full_check(void); extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_all(void); -extern void tick_nohz_task_switch(struct task_struct *tsk); +extern void __tick_nohz_task_switch(struct task_struct *tsk); #else static inline void tick_nohz_init(void) { } static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } -static inline void tick_nohz_full_check(void) { } +static inline void __tick_nohz_full_check(void) { } static inline void tick_nohz_full_kick(void) { } static inline void tick_nohz_full_kick_all(void) { } -static inline void tick_nohz_task_switch(struct task_struct *tsk) { } +static inline void __tick_nohz_task_switch(struct task_struct *tsk) { } #endif +static inline void tick_nohz_full_check(void) +{ + if (tick_nohz_full_enabled()) + __tick_nohz_full_check(); +} + +static inline void tick_nohz_task_switch(struct task_struct *tsk) +{ + if (tick_nohz_full_enabled()) + __tick_nohz_task_switch(tsk); +} + # ifdef CONFIG_CPU_IDLE_GOV_MENU extern void menu_hrtimer_cancel(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 0b7887389bd2..0ff6ae710161 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -198,7 +198,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now); * Re-evaluate the need for the tick on the current CPU * and restart it if necessary. */ -void tick_nohz_full_check(void) +void __tick_nohz_full_check(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); @@ -212,7 +212,7 @@ void tick_nohz_full_check(void) static void nohz_full_kick_work_func(struct irq_work *work) { - tick_nohz_full_check(); + __tick_nohz_full_check(); } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { @@ -231,7 +231,7 @@ void tick_nohz_full_kick(void) static void nohz_full_kick_ipi(void *info) { - tick_nohz_full_check(); + __tick_nohz_full_check(); } /* @@ -254,7 +254,7 @@ void tick_nohz_full_kick_all(void) * It might need the tick due to per task/process properties: * perf events, posix cpu timers, ... */ -void tick_nohz_task_switch(struct task_struct *tsk) +void __tick_nohz_task_switch(struct task_struct *tsk) { unsigned long flags; -- cgit v1.3-8-gc7d7 From fd5e2aa8653665ae1cc60f7aca1069abdbcad3f6 Mon Sep 17 00:00:00 2001 From: Dwight Engen Date: Thu, 15 Aug 2013 14:08:00 -0400 Subject: xfs: ioctl check for capabilities in the current user namespace Use inode_capable() to check if SUID|SGID bits should be cleared to match similar check in inode_change_ok(). The check for CAP_LINUX_IMMUTABLE was not modified since all other file systems also check against init_user_ns rather than current_user_ns. Only allow changing of projid from init_user_ns. Reviewed-by: Dave Chinner Reviewed-by: Gao feng Signed-off-by: Dwight Engen Signed-off-by: Ben Myers --- fs/xfs/xfs_ioctl.c | 11 +++++++++-- kernel/capability.c | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index e9c17e2ed6d7..999c1efd6af5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1015,15 +1015,22 @@ xfs_ioctl_setattr( * to the file owner ID, except in cases where the * CAP_FSETID capability is applicable. */ - if (current_fsuid() != ip->i_d.di_uid && !capable(CAP_FOWNER)) { + if (!inode_owner_or_capable(VFS_I(ip))) { code = XFS_ERROR(EPERM); goto error_return; } /* * Do a quota reservation only if projid is actually going to change. + * Only allow changing of projid from init_user_ns since it is a + * non user namespace aware identifier. */ if (mask & FSX_PROJID) { + if (current_user_ns() != &init_user_ns) { + code = XFS_ERROR(EINVAL); + goto error_return; + } + if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && xfs_get_projid(ip) != fa->fsx_projid) { @@ -1137,7 +1144,7 @@ xfs_ioctl_setattr( * cleared upon successful return from chown() */ if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) && - !capable(CAP_FSETID)) + !inode_capable(VFS_I(ip), CAP_FSETID)) ip->i_d.di_mode &= ~(S_ISUID|S_ISGID); /* diff --git a/kernel/capability.c b/kernel/capability.c index f6c2ce5701e1..a4b67446dc87 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -464,3 +464,4 @@ bool inode_capable(const struct inode *inode, int cap) return ns_capable(ns, cap) && kuid_has_mapping(ns, inode->i_uid); } +EXPORT_SYMBOL(inode_capable); -- cgit v1.3-8-gc7d7 From 930913a31289202d232186b82854b26d7fb7cf4d Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 16 Aug 2013 17:57:14 +0800 Subject: cgroup: use css_get() in cgroup_create() to check CSS_ROOT It seems that the root css doesn't have refcnt allocated(not needed?), and would cause the booting error attached. This patch tries to use css_get() to not increase the refcnt if parent is root. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] cgroup_mkdir+0x37c/0x740 PGD 0 Oops: 0002 [#1] Modules linked in: CPU: 0 PID: 1 Comm: systemd Not tainted 3.11.0-rc5-next-20130815+ #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 task: ffff88007f868000 ti: ffff88007f864000 task.ti: ffff88007f864000 RIP: 0010:[] [] cgroup_mkdir+0x37c/0x740 RSP: 0018:ffff88007f865df8 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffffffff81a46ee0 RCX: 0000000000000001 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff81a415c0 RBP: ffff88007f865ec8 R08: 0000000000000001 R09: 0000000000000000 R10: ffff88007ce6d060 R11: 0000000000000000 R12: ffff88007ce6d000 R13: ffff88007ce6d060 R14: ffffffff81a46d80 R15: ffff88007c6e8018 FS: 00007f13dbf6f840(0000) GS:ffffffff81a23000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000007b7e5000 CR4: 00000000000006b0 Stack: ffffffff810b380d 0000000000000002 ffff88007f865e18 ffffffff81167069 ffff88007f865ed8 ffffffff8116a3f5 ffff880037454400 ffff88007c6e8018 ffff88007c6e8028 ffff88007c6e8328 ffff88007c6e8000 ffff88007ce6d000 Call Trace: [] ? cgroup_mkdir+0x3bd/0x740 [] ? lookup_hash+0x19/0x20 [] ? kern_path_create+0x95/0x170 [] vfs_mkdir+0x9e/0xf0 [] SyS_mkdirat+0x60/0xe0 [] SyS_mkdir+0x19/0x20 [] tracesys+0xcf/0xd4 Code: ad 70 ff ff ff 48 89 9d 60 ff ff ff 4d 89 d5 4c 8b bd 68 ff ff ff 4c 8b 65 88 eb 50 0f 1f 00 48 8b 43 18 a8 03 0f 85 6c 03 00 00 00 e8 1d 0a fb ff 85 c0 74 0d 80 3d f0 45 a1 00 00 0f 84 4c RIP [] cgroup_mkdir+0x37c/0x740 RSP CR2: 0000000000000000 ---[ end trace a4b14b49bc46fd60 ]--- Signed-off-by: Li Zhong Acked-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 66d01078eebe..b69b572131e5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4494,7 +4494,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, struct cgroup_subsys_state *css = css_ar[ss->subsys_id]; dget(dentry); - percpu_ref_get(&css->parent->refcnt); + css_get(css->parent); } /* hold a ref to the parent's dentry */ -- cgit v1.3-8-gc7d7 From c8d2d47a9cbb4222ae4e993aa0e3703430c8193c Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Tue, 6 Aug 2013 20:06:42 +0800 Subject: cpumask: Fix cpumask leak in partition_sched_domains() If doms_new is NULL, partition_sched_domains() will reset ndoms_cur to 0, and free old sched domains with free_sched_domains(doms_cur, ndoms_cur). As ndoms_cur is 0, the cpumask will not be freed. Signed-off-by: Xiaotian Feng Cc: Rusty Russell Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375790802-11857-1-git-send-email-xtfeng@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7415cfdd7de..cf8f100433e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6184,8 +6184,9 @@ match1: ; } + n = ndoms_cur; if (doms_new == NULL) { - ndoms_cur = 0; + n = 0; doms_new = &fallback_doms; cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map); WARN_ON_ONCE(dattr_new); @@ -6193,7 +6194,7 @@ match1: /* Build new domains */ for (i = 0; i < ndoms_new; i++) { - for (j = 0; j < ndoms_cur && !new_topology; j++) { + for (j = 0; j < n && !new_topology; j++) { if (cpumask_equal(doms_new[i], doms_cur[j]) && dattrs_equal(dattr_new, i, dattr_cur, j)) goto match2; -- cgit v1.3-8-gc7d7 From a4f61cc03e443647211a5ae0ab8f8cda2e9e1043 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 7 Aug 2013 15:38:24 +0000 Subject: sched/cputime: Use this_cpu_add() in task_group_account_field() Use of a this_cpu() operation reduces the number of instructions used for accounting (account_user_time()) and frees up some registers. This is in the scheduler tick hotpath. Signed-off-by: Christoph Lameter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/00000140596dd165-338ff7f5-893b-4fec-b251-aaac5557239e-000000@email.amazonses.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a7959e05a9d5..e89ccefef278 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -121,7 +121,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, * is the only cgroup, then nothing else should be necessary. * */ - __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; + __this_cpu_add(kernel_cpustat.cpustat[index], tmp); cpuacct_account_field(p, index, tmp); } -- cgit v1.3-8-gc7d7 From c2e7fcf53c3cb02b4ada1c66a9bc8a4d97d58aba Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 2 Aug 2013 18:29:56 +0200 Subject: nohz: Include local CPU in full dynticks global kick tick_nohz_full_kick_all() is useful to notify all full dynticks CPUs that there is a system state change to checkout before re-evaluating the need for the tick. Unfortunately this is implemented using smp_call_function_many() that ignores the local CPU. This CPU also needs to re-evaluate the tick. on_each_cpu_mask() is not useful either because we don't want to re-evaluate the tick state in place but asynchronously from an IPI to avoid messing up with any random locking scenario. So lets call tick_nohz_full_kick() from tick_nohz_full_kick_all() so that the usual irq work takes care of it. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Cc: Kevin Hilman Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375460996-16329-4-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index adea6fc3ba2a..3612fc77f834 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -246,6 +246,7 @@ void tick_nohz_full_kick_all(void) preempt_disable(); smp_call_function_many(tick_nohz_full_mask, nohz_full_kick_ipi, NULL, false); + tick_nohz_full_kick(); preempt_enable(); } -- cgit v1.3-8-gc7d7 From fc3b86d673e41ac66b4ba5b75a90c2fcafb90089 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 2 Aug 2013 18:29:54 +0200 Subject: perf: Roll back callchain buffer refcount under the callchain mutex When we fail to allocate the callchain buffers, we roll back the refcount we did and return from get_callchain_buffers(). However we take the refcount and allocate under the callchain lock but the rollback is done outside the lock. As a result, while we roll back, some concurrent callchain user may call get_callchain_buffers(), see the non-zero refcount and give up because the buffers are NULL without itself retrying the allocation. The consequences aren't that bad but that behaviour looks weird enough and it's better to give their chances to the following callchain users where we failed. Reported-by: Jiri Olsa Signed-off-by: Frederic Weisbecker Acked-by: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375460996-16329-2-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/callchain.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index 76a8bc5f6265..97b67df8fbfe 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -116,10 +116,11 @@ int get_callchain_buffers(void) err = alloc_callchain_buffers(); exit: - mutex_unlock(&callchain_mutex); if (err) atomic_dec(&nr_callchain_events); + mutex_unlock(&callchain_mutex); + return err; } -- cgit v1.3-8-gc7d7 From 948b26b6ddd08a57cb95ebb0dc96fde2edd5c383 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 2 Aug 2013 18:29:55 +0200 Subject: perf: Account freq events globally Freq events may not always be affine to a particular CPU. As such, account_event_cpu() may crash if we account per cpu a freq event that has event->cpu == -1. To solve this, lets account freq events globally. In practice this doesn't change much the picture because perf tools create per-task perf events with one event per CPU by default. Profiling a single CPU is usually a corner case so there is no much point in optimizing things that way. Reported-by: Jiri Olsa Suggested-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Tested-by: Jiri Olsa Cc: Namhyung Kim Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375460996-16329-3-git-send-email-fweisbec@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e82e70025d42..2e675e830976 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -141,11 +141,11 @@ enum event_type_t { struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); -static DEFINE_PER_CPU(atomic_t, perf_freq_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; +static atomic_t nr_freq_events __read_mostly; static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); @@ -1871,9 +1871,6 @@ static int __perf_install_in_context(void *info) perf_pmu_enable(cpuctx->ctx.pmu); perf_ctx_unlock(cpuctx, task_ctx); - if (atomic_read(&__get_cpu_var(perf_freq_events))) - tick_nohz_full_kick(); - return 0; } @@ -2811,7 +2808,7 @@ done: #ifdef CONFIG_NO_HZ_FULL bool perf_event_can_stop_tick(void) { - if (atomic_read(&__get_cpu_var(perf_freq_events)) || + if (atomic_read(&nr_freq_events) || __this_cpu_read(perf_throttled_count)) return false; else @@ -3140,9 +3137,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) } if (is_cgroup_event(event)) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); - - if (event->attr.freq) - atomic_dec(&per_cpu(perf_freq_events, cpu)); } static void unaccount_event(struct perf_event *event) @@ -3158,6 +3152,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_comm_events); if (event->attr.task) atomic_dec(&nr_task_events); + if (event->attr.freq) + atomic_dec(&nr_freq_events); if (is_cgroup_event(event)) static_key_slow_dec_deferred(&perf_sched_events); if (has_branch_stack(event)) @@ -6489,9 +6485,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) } if (is_cgroup_event(event)) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); - - if (event->attr.freq) - atomic_inc(&per_cpu(perf_freq_events, cpu)); } static void account_event(struct perf_event *event) @@ -6507,6 +6500,10 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); + if (event->attr.freq) { + if (atomic_inc_return(&nr_freq_events) == 1) + tick_nohz_full_kick_all(); + } if (has_branch_stack(event)) static_key_slow_inc(&perf_sched_events.key); if (is_cgroup_event(event)) -- cgit v1.3-8-gc7d7 From 5ec4c599a52362896c3e7c6a31ba6145dca9c6f5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Aug 2013 21:16:30 +0200 Subject: perf: Do not compute time values unnecessarily We should not be calling calc_timer_values() for events that do not actually have an mmap()'ed userpage. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130802191630.GT27162@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2e675e830976..928fae7ca8c7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3670,6 +3670,10 @@ void perf_event_update_userpage(struct perf_event *event) u64 enabled, running, now; rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + /* * compute total_time_enabled, total_time_running * based on snapshot values taken when the event @@ -3680,12 +3684,8 @@ void perf_event_update_userpage(struct perf_event *event) * NMI context */ calc_timer_values(event, &now, &enabled, &running); - rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; userpg = rb->user_page; - /* * Disable preemption so as to not let the corresponding user-space * spin too long if we get preempted. -- cgit v1.3-8-gc7d7 From d1d74d14e98a6be740a6f12456c7d9ad47be9c9c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 22 Apr 2013 00:12:42 +0200 Subject: rcu: Expedite grace periods during suspend/resume MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CONFIG_RCU_FAST_NO_HZ can increase grace-period durations by up to a factor of four, which can result in long suspend and resume times. Thus, this commit temporarily switches to expedited grace periods when suspending the box and return to normal settings when resuming. Similar logic is applied to hibernation. Because expedited grace periods are of dubious benefit on very large systems, so this commit restricts their automated use during suspend and resume to systems of 256 or fewer CPUs. (Some day a number of Linux-kernel facilities, including RCU's expedited grace periods, will be more scalable, but I need to see bug reports first.) [ paulmck: This also papers over an audio/irq bug, but hopefully that will be fixed soon. ] Signed-off-by: Borislav Petkov Signed-off-by: Bjørn Mork Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 338f1d1c1c66..a7bf517b0482 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -54,6 +54,7 @@ #include #include #include +#include #include "rcutree.h" #include @@ -3032,6 +3033,25 @@ static int rcu_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } +static int rcu_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_HIBERNATION_PREPARE: + case PM_SUSPEND_PREPARE: + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ + rcu_expedited = 1; + break; + case PM_POST_HIBERNATION: + case PM_POST_SUSPEND: + rcu_expedited = 0; + break; + default: + break; + } + return NOTIFY_OK; +} + /* * Spawn the kthread that handles this RCU flavor's grace periods. */ @@ -3273,6 +3293,7 @@ void __init rcu_init(void) * or the scheduler are operational. */ cpu_notifier(rcu_cpu_notify, 0); + pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } -- cgit v1.3-8-gc7d7 From 15100df81fcc3109862f7c03266c0abff4262564 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Apr 2013 11:31:50 -0700 Subject: rcu: Simplify debug-objects fixups The current debug-objects fixups are complex and heavyweight, and the fixups are not complete: Even with the fixups, RCU's callback lists can still be corrupted. This commit therefore strips the fixups down to their minimal form, eliminating two of the three. It would be even better if (for example) call_rcu() simply leaked any problematic callbacks, but for that to happen, the debug-objects system would need to inform its caller of suspicious situations. This is the subject of a later commit in this series. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Sedat Dilek Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Thomas Gleixner Cc: Linus Torvalds Tested-by: Sedat Dilek Reviewed-by: Josh Triplett --- kernel/rcupdate.c | 100 ------------------------------------------------------ 1 file changed, 100 deletions(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 14994d4e1a54..33eb4620aa17 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -211,43 +211,6 @@ static inline void debug_rcu_head_free(struct rcu_head *head) debug_object_free(head, &rcuhead_debug_descr); } -/* - * fixup_init is called when: - * - an active object is initialized - */ -static int rcuhead_fixup_init(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_init(head, &rcuhead_debug_descr); - return 1; - default: - return 0; - } -} - /* * fixup_activate is called when: * - an active object is activated @@ -268,69 +231,8 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) debug_object_init(head, &rcuhead_debug_descr); debug_object_activate(head, &rcuhead_debug_descr); return 0; - - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_activate(head, &rcuhead_debug_descr); - return 1; default: - return 0; - } -} - -/* - * fixup_free is called when: - * - an active object is freed - */ -static int rcuhead_fixup_free(void *addr, enum debug_obj_state state) -{ - struct rcu_head *head = addr; - - switch (state) { - case ODEBUG_STATE_ACTIVE: - /* - * Ensure that queued callbacks are all executed. - * If we detect that we are nested in a RCU read-side critical - * section, we should simply fail, otherwise we would deadlock. - * In !PREEMPT configurations, there is no way to tell if we are - * in a RCU read-side critical section or not, so we never - * attempt any fixup and just print a warning. - */ -#ifndef CONFIG_PREEMPT - WARN_ON_ONCE(1); - return 0; -#endif - if (rcu_preempt_depth() != 0 || preempt_count() != 0 || - irqs_disabled()) { - WARN_ON_ONCE(1); - return 0; - } - rcu_barrier(); - rcu_barrier_sched(); - rcu_barrier_bh(); - debug_object_free(head, &rcuhead_debug_descr); return 1; - default: - return 0; } } @@ -369,9 +271,7 @@ EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); struct debug_obj_descr rcuhead_debug_descr = { .name = "rcu_head", - .fixup_init = rcuhead_fixup_init, .fixup_activate = rcuhead_fixup_activate, - .fixup_free = rcuhead_fixup_free, }; EXPORT_SYMBOL_GPL(rcuhead_debug_descr); #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -- cgit v1.3-8-gc7d7 From ae15018456c44b742d352af323e0b89eae4a6383 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Apr 2013 13:20:57 -0700 Subject: rcu: Make call_rcu() leak callbacks for debug-object errors If someone does a duplicate call_rcu(), the worst thing the second call_rcu() could do would be to actually queue the callback the second time because doing so corrupts whatever list the callback was already queued on. This commit therefore makes __call_rcu() check the new return value from debug-objects and leak the callback upon error. This commit also substitutes rcu_leak_callback() for whatever callback function was previously in place in order to avoid freeing the callback out from under any readers that might still be referencing it. These changes increase the probability that the debug-objects error messages will actually make it somewhere visible. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Sedat Dilek Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Thomas Gleixner Cc: Linus Torvalds Tested-by: Sedat Dilek Reviewed-by: Josh Triplett --- kernel/rcu.h | 10 +++++++--- kernel/rcutree.c | 14 +++++++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu.h b/kernel/rcu.h index 0a90ccc65bfb..77131966c4ad 100644 --- a/kernel/rcu.h +++ b/kernel/rcu.h @@ -67,12 +67,15 @@ extern struct debug_obj_descr rcuhead_debug_descr; -static inline void debug_rcu_head_queue(struct rcu_head *head) +static inline int debug_rcu_head_queue(struct rcu_head *head) { - debug_object_activate(head, &rcuhead_debug_descr); + int r1; + + r1 = debug_object_activate(head, &rcuhead_debug_descr); debug_object_active_state(head, &rcuhead_debug_descr, STATE_RCU_HEAD_READY, STATE_RCU_HEAD_QUEUED); + return r1; } static inline void debug_rcu_head_unqueue(struct rcu_head *head) @@ -83,8 +86,9 @@ static inline void debug_rcu_head_unqueue(struct rcu_head *head) debug_object_deactivate(head, &rcuhead_debug_descr); } #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ -static inline void debug_rcu_head_queue(struct rcu_head *head) +static inline int debug_rcu_head_queue(struct rcu_head *head) { + return 0; } static inline void debug_rcu_head_unqueue(struct rcu_head *head) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index a7bf517b0482..91840566e294 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2304,6 +2304,13 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, } } +/* + * RCU callback function to leak a callback. + */ +static void rcu_leak_callback(struct rcu_head *rhp) +{ +} + /* * Helper function for call_rcu() and friends. The cpu argument will * normally be -1, indicating "currently running CPU". It may specify @@ -2318,7 +2325,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), struct rcu_data *rdp; WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ - debug_rcu_head_queue(head); + if (debug_rcu_head_queue(head)) { + /* Probable double call_rcu(), so leak the callback. */ + ACCESS_ONCE(head->func) = rcu_leak_callback; + WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); + return; + } head->func = func; head->next = NULL; -- cgit v1.3-8-gc7d7 From 1eafd31c640d6799c63136246a59d608bed93c74 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 20 Jun 2013 13:50:40 -0700 Subject: rcu: Avoid redundant grace-period kthread wakeups When setting up an in-the-future "advanced" grace period, the code needs to wake up the relevant grace-period kthread, which it currently does unconditionally. However, this results in needless wakeups in the case where the advanced grace period is being set up by the grace-period kthread itself, which is a non-uncommon situation. This commit therefore checks to see if the running thread is the grace-period kthread, and avoids doing the irq_work_queue()-mediated wakeup in that case. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 91840566e294..c6a064abd6a0 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1576,10 +1576,12 @@ rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, /* * We can't do wakeups while holding the rnp->lock, as that - * could cause possible deadlocks with the rq->lock. Deter - * the wakeup to interrupt context. + * could cause possible deadlocks with the rq->lock. Defer + * the wakeup to interrupt context. And don't bother waking + * up the running kthread. */ - irq_work_queue(&rsp->wakeup_work); + if (current != rsp->gp_kthread) + irq_work_queue(&rsp->wakeup_work); } /* -- cgit v1.3-8-gc7d7 From feed66ed26a53e700ca02ce1744fed7d0c647292 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 9 May 2013 08:55:54 -0700 Subject: rcu: Eliminate unused APIs intended for adaptive ticks The rcu_user_enter_after_irq() and rcu_user_exit_after_irq() functions were intended for use by adaptive ticks, but changes in implementation have rendered them unnecessary. This commit therefore removes them. Reported-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcupdate.h | 4 ---- kernel/rcutree.c | 43 ------------------------------------------- 2 files changed, 47 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 0c38abbe6e35..30bea9c25735 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -229,13 +229,9 @@ extern void rcu_irq_exit(void); #ifdef CONFIG_RCU_USER_QS extern void rcu_user_enter(void); extern void rcu_user_exit(void); -extern void rcu_user_enter_after_irq(void); -extern void rcu_user_exit_after_irq(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } -static inline void rcu_user_enter_after_irq(void) { } -static inline void rcu_user_exit_after_irq(void) { } static inline void rcu_user_hooks_switch(struct task_struct *prev, struct task_struct *next) { } #endif /* CONFIG_RCU_USER_QS */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 338f1d1c1c66..8807019138c6 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -444,27 +444,6 @@ void rcu_user_enter(void) { rcu_eqs_enter(1); } - -/** - * rcu_user_enter_after_irq - inform RCU that we are going to resume userspace - * after the current irq returns. - * - * This is similar to rcu_user_enter() but in the context of a non-nesting - * irq. After this call, RCU enters into idle mode when the interrupt - * returns. - */ -void rcu_user_enter_after_irq(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - /* Ensure this irq is interrupting a non-idle RCU state. */ - WARN_ON_ONCE(!(rdtp->dynticks_nesting & DYNTICK_TASK_MASK)); - rdtp->dynticks_nesting = 1; - local_irq_restore(flags); -} #endif /* CONFIG_RCU_USER_QS */ /** @@ -581,28 +560,6 @@ void rcu_user_exit(void) { rcu_eqs_exit(1); } - -/** - * rcu_user_exit_after_irq - inform RCU that we won't resume to userspace - * idle mode after the current non-nesting irq returns. - * - * This is similar to rcu_user_exit() but in the context of an irq. - * This is called when the irq has interrupted a userspace RCU idle mode - * context. When the current non-nesting interrupt returns after this call, - * the CPU won't restore the RCU idle mode. - */ -void rcu_user_exit_after_irq(void) -{ - unsigned long flags; - struct rcu_dynticks *rdtp; - - local_irq_save(flags); - rdtp = &__get_cpu_var(rcu_dynticks); - /* Ensure we are interrupting an RCU idle mode. */ - WARN_ON_ONCE(rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK); - rdtp->dynticks_nesting += DYNTICK_TASK_EXIT_IDLE; - local_irq_restore(flags); -} #endif /* CONFIG_RCU_USER_QS */ /** -- cgit v1.3-8-gc7d7 From b44379af1cf40050794832c38ea6a64e07eb5087 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 11:08:45 -0700 Subject: nohz_full: Add Kconfig parameter for scalable detection of all-idle state At least one CPU must keep the scheduling-clock tick running for timekeeping purposes whenever there is a non-idle CPU. However, with the new nohz_full adaptive-idle machinery, it is difficult to distinguish between all CPUs really being idle as opposed to all non-idle CPUs being in adaptive-ticks mode. This commit therefore adds a Kconfig parameter as a first step towards enabling a scalable detection of full-system idle state. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Steven Rostedt [ paulmck: Update help text per Frederic Weisbecker. ] Reviewed-by: Josh Triplett --- kernel/time/Kconfig | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 70f27e89012b..c7d2fd67799e 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -134,6 +134,29 @@ config NO_HZ_FULL_ALL Note the boot CPU will still be kept outside the range to handle the timekeeping duty. +config NO_HZ_FULL_SYSIDLE + bool "Detect full-system idle state for full dynticks system" + depends on NO_HZ_FULL + default n + help + At least one CPU must keep the scheduling-clock tick running for + timekeeping purposes whenever there is a non-idle CPU, where + "non-idle" also includes dynticks CPUs as long as they are + running non-idle tasks. Because the underlying adaptive-tick + support cannot distinguish between all CPUs being idle and + all CPUs each running a single task in dynticks mode, the + underlying support simply ensures that there is always a CPU + handling the scheduling-clock tick, whether or not all CPUs + are idle. This Kconfig option enables scalable detection of + the all-CPUs-idle state, thus allowing the scheduling-clock + tick to be disabled when all CPUs are idle. Note that scalable + detection of the all-CPUs-idle state means that larger systems + will be slower to declare the all-CPUs-idle state. + + Say Y if you would like to help debug all-CPUs-idle detection. + + Say N if you are unsure. + config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS -- cgit v1.3-8-gc7d7 From 2333210b26cf7aaf48d71343029afb860103d9f9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 12:34:33 -0700 Subject: nohz_full: Add rcu_dyntick data for scalable detection of all-idle state This commit adds fields to the rcu_dyntick structure that are used to detect idle CPUs. These new fields differ from the existing ones in that the existing ones consider a CPU executing in user mode to be idle, where the new ones consider CPUs executing in user mode to be busy. The handling of these new fields is otherwise quite similar to that for the exiting fields. This commit also adds the initialization required for these fields. So, why is usermode execution treated differently, with RCU considering it a quiescent state equivalent to idle, while in contrast the new full-system idle state detection considers usermode execution to be non-idle? It turns out that although one of RCU's quiescent states is usermode execution, it is not a full-system idle state. This is because the purpose of the full-system idle state is not RCU, but rather determining when accurate timekeeping can safely be disabled. Whenever accurate timekeeping is required in a CONFIG_NO_HZ_FULL kernel, at least one CPU must keep the scheduling-clock tick going. If even one CPU is executing in user mode, accurate timekeeping is requires, particularly for architectures where gettimeofday() and friends do not enter the kernel. Only when all CPUs are really and truly idle can accurate timekeeping be disabled, allowing all CPUs to turn off the scheduling clock interrupt, thus greatly improving energy efficiency. This naturally raises the question "Why is this code in RCU rather than in timekeeping?", and the answer is that RCU has the data and infrastructure to efficiently make this determination. Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker Cc: Steven Rostedt Reviewed-by: Josh Triplett --- kernel/rcutree.c | 5 +++++ kernel/rcutree.h | 9 +++++++++ kernel/rcutree_plugin.h | 19 +++++++++++++++++++ 3 files changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 8807019138c6..4f27b85d8c86 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -224,6 +224,10 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, .dynticks = ATOMIC_INIT(1), +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, + .dynticks_idle = ATOMIC_INIT(1), +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ }; static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ @@ -2904,6 +2908,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) rdp->blimit = blimit; init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; + rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index cbdeac6cea9e..52d1be108e75 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -88,6 +88,14 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + long long dynticks_idle_nesting; + /* irq/process nesting level from idle. */ + atomic_t dynticks_idle; /* Even value for idle, else odd. */ + /* "Idle" excludes userspace execution. */ + unsigned long dynticks_idle_jiffies; + /* End of last non-NMI non-idle period. */ +#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ #ifdef CONFIG_RCU_FAST_NO_HZ bool all_lazy; /* Are all CPU's CBs lazy? */ unsigned long nonlazy_posted; @@ -545,6 +553,7 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index dff86f53ee09..e5baccbd8038 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2373,3 +2373,22 @@ static void rcu_kick_nohz_cpu(int cpu) smp_send_reschedule(cpu); #endif /* #ifdef CONFIG_NO_HZ_FULL */ } + + +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE + +/* + * Initialize dynticks sysidle state for CPUs coming online. + */ +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ + rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; +} + +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ -- cgit v1.3-8-gc7d7 From eb348b898290da242e46df75ab0b9772003e08b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 13:00:57 -0700 Subject: nohz_full: Add per-CPU idle-state tracking This commit adds the code that updates the rcu_dyntick structure's new fields to track the per-CPU idle state based on interrupts and transitions into and out of the idle loop (NMIs are ignored because NMI handlers cannot cleanly read out the time anyway). This code is similar to the code that maintains RCU's idea of per-CPU idleness, but differs in that RCU treats CPUs running in user mode as idle, where this new code does not. Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker Cc: Steven Rostedt Reviewed-by: Josh Triplett --- kernel/rcutree.c | 4 +++ kernel/rcutree.h | 2 ++ kernel/rcutree_plugin.h | 79 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 85 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 4f27b85d8c86..b0d2cc3ea15a 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -431,6 +431,7 @@ void rcu_idle_enter(void) local_irq_save(flags); rcu_eqs_enter(false); + rcu_sysidle_enter(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_enter); @@ -481,6 +482,7 @@ void rcu_irq_exit(void) trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); else rcu_eqs_enter_common(rdtp, oldval, true); + rcu_sysidle_enter(rdtp, 1); local_irq_restore(flags); } @@ -549,6 +551,7 @@ void rcu_idle_exit(void) local_irq_save(flags); rcu_eqs_exit(false); + rcu_sysidle_exit(&__get_cpu_var(rcu_dynticks), 0); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(rcu_idle_exit); @@ -600,6 +603,7 @@ void rcu_irq_enter(void) trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); else rcu_eqs_exit_common(rdtp, oldval, true); + rcu_sysidle_exit(rdtp, 1); local_irq_restore(flags); } diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 52d1be108e75..9dd8b177f1ac 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -553,6 +553,8 @@ static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index e5baccbd8038..eab81da614b8 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2377,6 +2377,77 @@ static void rcu_kick_nohz_cpu(int cpu) #ifdef CONFIG_NO_HZ_FULL_SYSIDLE +/* + * Invoked to note exit from irq or task transition to idle. Note that + * usermode execution does -not- count as idle here! After all, we want + * to detect full-system idle states, not RCU quiescent states and grace + * periods. The caller must have disabled interrupts. + */ +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ + unsigned long j; + + /* Adjust nesting, check for fully idle. */ + if (irq) { + rdtp->dynticks_idle_nesting--; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + if (rdtp->dynticks_idle_nesting != 0) + return; /* Still not fully idle. */ + } else { + if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == + DYNTICK_TASK_NEST_VALUE) { + rdtp->dynticks_idle_nesting = 0; + } else { + rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); + return; /* Still not fully idle. */ + } + } + + /* Record start of fully idle period. */ + j = jiffies; + ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); +} + +/* + * Invoked to note entry to irq or task transition from idle. Note that + * usermode execution does -not- count as idle here! The caller must + * have disabled interrupts. + */ +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ + /* Adjust nesting, check for already non-idle. */ + if (irq) { + rdtp->dynticks_idle_nesting++; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + if (rdtp->dynticks_idle_nesting != 1) + return; /* Already non-idle. */ + } else { + /* + * Allow for irq misnesting. Yes, it really is possible + * to enter an irq handler then never leave it, and maybe + * also vice versa. Handle both possibilities. + */ + if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { + rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; + WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); + return; /* Already non-idle. */ + } else { + rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; + } + } + + /* Record end of idle period. */ + smp_mb__before_atomic_inc(); + atomic_inc(&rdtp->dynticks_idle); + smp_mb__after_atomic_inc(); + WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); +} + /* * Initialize dynticks sysidle state for CPUs coming online. */ @@ -2387,6 +2458,14 @@ static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ +static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) +{ +} + +static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) +{ +} + static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) { } -- cgit v1.3-8-gc7d7 From d4bd54fbac2ea5c30eb976ca557e905f489d55f4 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 14:51:40 -0700 Subject: nohz_full: Add full-system idle states and variables This commit adds control variables and states for full-system idle. The system will progress through the states in numerical order when the system is fully idle (other than the timekeeping CPU), and reset down to the initial state if any non-timekeeping CPU goes non-idle. The current state is kept in full_sysidle_state. One flavor of RCU will be in charge of driving the state machine, defined by rcu_sysidle_state. This should be the busiest flavor of RCU. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Steven Rostedt Reviewed-by: Josh Triplett --- kernel/rcutree_plugin.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index eab81da614b8..a7419ceb19ad 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2377,6 +2377,23 @@ static void rcu_kick_nohz_cpu(int cpu) #ifdef CONFIG_NO_HZ_FULL_SYSIDLE +/* + * Define RCU flavor that holds sysidle state. This needs to be the + * most active flavor of RCU. + */ +#ifdef CONFIG_PREEMPT_RCU +static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state; +#else /* #ifdef CONFIG_PREEMPT_RCU */ +static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state; +#endif /* #else #ifdef CONFIG_PREEMPT_RCU */ + +static int __maybe_unused full_sysidle_state; /* Current system-idle state. */ +#define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ +#define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ +#define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ +#define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ +#define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ + /* * Invoked to note exit from irq or task transition to idle. Note that * usermode execution does -not- count as idle here! After all, we want -- cgit v1.3-8-gc7d7 From 217af2a2ffbfc1498d1cf3a89fa478b5632df8f7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 15:39:06 -0700 Subject: nohz_full: Add full-system-idle arguments to API This commit adds an isidle and jiffies argument to force_qs_rnp(), dyntick_save_progress_counter(), and rcu_implicit_dynticks_qs() to enable RCU's force-quiescent-state process to check for full-system idle. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Lai Jiangshan [ paulmck: Use true and false for boolean constants per Lai Jiangshan. ] Reviewed-by: Josh Triplett --- kernel/rcutree.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index b0d2cc3ea15a..7b5be56d95ae 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -246,7 +246,10 @@ module_param(jiffies_till_next_fqs, ulong, 0644); static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp); -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)); +static void force_qs_rnp(struct rcu_state *rsp, + int (*f)(struct rcu_data *rsp, bool *isidle, + unsigned long *maxj), + bool *isidle, unsigned long *maxj); static void force_quiescent_state(struct rcu_state *rsp); static int rcu_pending(int cpu); @@ -727,7 +730,8 @@ static int rcu_is_cpu_rrupt_from_idle(void) * credit them with an implicit quiescent state. Return 1 if this CPU * is in dynticks idle mode, which is an extended quiescent state. */ -static int dyntick_save_progress_counter(struct rcu_data *rdp) +static int dyntick_save_progress_counter(struct rcu_data *rdp, + bool *isidle, unsigned long *maxj) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); return (rdp->dynticks_snap & 0x1) == 0; @@ -739,7 +743,8 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp) * idle state since the last call to dyntick_save_progress_counter() * for this same CPU, or by virtue of having been offline. */ -static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) +static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, + bool *isidle, unsigned long *maxj) { unsigned int curr; unsigned int snap; @@ -1361,16 +1366,19 @@ static int rcu_gp_init(struct rcu_state *rsp) int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) { int fqs_state = fqs_state_in; + bool isidle = false; + unsigned long maxj; struct rcu_node *rnp = rcu_get_root(rsp); rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ - force_qs_rnp(rsp, dyntick_save_progress_counter); + force_qs_rnp(rsp, dyntick_save_progress_counter, + &isidle, &maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ - force_qs_rnp(rsp, rcu_implicit_dynticks_qs); + force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { @@ -2069,7 +2077,10 @@ void rcu_check_callbacks(int cpu, int user) * * The caller must have suppressed start of new grace periods. */ -static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) +static void force_qs_rnp(struct rcu_state *rsp, + int (*f)(struct rcu_data *rsp, bool *isidle, + unsigned long *maxj), + bool *isidle, unsigned long *maxj) { unsigned long bit; int cpu; @@ -2093,7 +2104,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu))) + f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } if (mask != 0) { -- cgit v1.3-8-gc7d7 From 15e71911fcc655508e02f767a3d9b8b138051d2b Mon Sep 17 00:00:00 2001 From: Xie XiuQi Date: Mon, 29 Jul 2013 11:52:24 +0800 Subject: generic-ipi/locking: Fix misleading smp_call_function_any() description Fix locking description: after commit 8969a5ede0f9e17da4b9437 ("generic-ipi: remove kmalloc()"), wait = 0 can be guaranteed because we don't kmalloc() anymore. Signed-off-by: Xie XiuQi Cc: Sheng Yang Cc: Peter Zijlstra Cc: Jens Axboe Cc: Rusty Russell Link: http://lkml.kernel.org/r/51F5E6F8.1000801@huawei.com Signed-off-by: Ingo Molnar --- kernel/smp.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index fe9f773d7114..b1c9034bdfcb 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -278,8 +278,6 @@ EXPORT_SYMBOL(smp_call_function_single); * @wait: If true, wait until function has completed. * * Returns 0 on success, else a negative status code (if no cpus were online). - * Note that @wait will be implicitly turned on in case of allocation failures, - * since we fall back to on-stack allocation. * * Selection preference: * 1) current cpu if in @mask -- cgit v1.3-8-gc7d7 From 1cb650b91ba582f6737457b7d22e368585596d2c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 19 Aug 2013 10:05:24 +0800 Subject: cgroup: change cgroup_from_id() to css_from_id() Now we want cgroup core to always provide the css to use to the subsystems, so change this API to css_from_id(). Uninline css_from_id(), because it's getting bigger and cgroup_css() has been unexported. While at it, remove the #ifdef, and shuffle the order of the args. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- include/linux/cgroup.h | 20 ++------------------ kernel/cgroup.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c24bd0b9f93a..b685955d4b29 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -741,27 +741,11 @@ static inline struct cgroup *task_cgroup(struct task_struct *task, return task_css(task, subsys_id)->cgroup; } -/** - * cgroup_from_id - lookup cgroup by id - * @ss: cgroup subsys to be looked into - * @id: the cgroup id - * - * Returns the cgroup if there's valid one with @id, otherwise returns NULL. - * Should be called under rcu_read_lock(). - */ -static inline struct cgroup *cgroup_from_id(struct cgroup_subsys *ss, int id) -{ -#ifdef CONFIG_PROVE_RCU - rcu_lockdep_assert(rcu_read_lock_held() || - lockdep_is_held(&cgroup_mutex), - "cgroup_from_id() needs proper protection"); -#endif - return idr_find(&ss->root->cgroup_idr, id); -} - struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); + /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b69b572131e5..ff7d642a070a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5717,6 +5717,28 @@ struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) return css ? css : ERR_PTR(-ENOENT); } +/** + * css_from_id - lookup css by id + * @id: the cgroup id + * @ss: cgroup subsys to be looked into + * + * Returns the css if there's valid one with @id, otherwise returns NULL. + * Should be called under rcu_read_lock(). + */ +struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) +{ + struct cgroup *cgrp; + + rcu_lockdep_assert(rcu_read_lock_held() || + lockdep_is_held(&cgroup_mutex), + "css_from_id() needs proper protection"); + + cgrp = idr_find(&ss->root->cgroup_idr, id); + if (cgrp) + return cgroup_css(cgrp, ss->subsys_id); + return NULL; +} + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) -- cgit v1.3-8-gc7d7 From 0bfb4aa67cef4982adc70590a31624d7b35a0bda Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Aug 2013 11:42:36 -0400 Subject: cgroup: fix subsystem file accesses on the root cgroup 105347ba5 ("cgroup: make cgroup_file_open() rcu_read_lock() around cgroup_css() and add cfent->css") added cfent->css to cache the associted cgroup_subsys_state across file operations. A cfent is associated with single css throughout its lifetime and the origimal commit initialized the cache pointer during cgroup_add_file() and verified that it matches the actual one in cgroup_file_open(). While this works fine for !root cgroups, it's broken for root cgroups as files in a root cgroup are created before the css's are associated with the cgroup and thus cgroup_css() call in cgroup_add_file() returns NULL associating all cfents in the root cgroup with NULL css. This makes cgroup_file_open() trigger WARN and fail with -ENODEV for all !core subsystem files in the root cgroups. There's no reason to initialize cfent->css separately from cgroup_add_file(). As the association never changes, cgroup_file_open() can set it unconditionally every time and containing the logic in cgroup_file_open() makes more sense anyway as the only reason it's necessary is file->private_data being already occupied. Fix it by setting cfent->css unconditionally from cgroup_file_open(). Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ff7d642a070a..896e035eb6e4 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2490,10 +2490,18 @@ static int cgroup_file_open(struct inode *inode, struct file *file) } rcu_read_unlock(); - /* css should match @cfe->css, see cgroup_add_file() for details */ - if (!css || WARN_ON_ONCE(css != cfe->css)) + if (!css) return -ENODEV; + /* + * @cfe->css is used by read/write/close to determine the + * associated css. @file->private_data would be a better place but + * that's already used by seqfile. Multiple accessors may use it + * simultaneously which is okay as the association never changes. + */ + WARN_ON_ONCE(cfe->css && cfe->css != css); + cfe->css = css; + if (cft->read_map || cft->read_seq_string) { file->f_op = &cgroup_seqfile_operations; err = single_open(file, cgroup_seqfile_show, cfe); @@ -2772,18 +2780,6 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) dentry->d_fsdata = cfe; simple_xattrs_init(&cfe->xattrs); - /* - * cfe->css is used by read/write/close to determine the associated - * css. file->private_data would be a better place but that's - * already used by seqfile. Note that open will use the usual - * cgroup_css() and css_tryget() to acquire the css and this - * caching doesn't affect css lifetime management. - */ - if (cft->ss) - cfe->css = cgroup_css(cgrp, cft->ss->subsys_id); - else - cfe->css = &cgrp->dummy_css; - mode = cgroup_file_mode(cft); error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); if (!error) { -- cgit v1.3-8-gc7d7 From 6e6eab0efdf48fb2d8d7aee904d7740acb4661c6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 15 Aug 2013 11:43:15 -0400 Subject: cgroup: fix cgroup_write_event_control() 81eeaf0411 ("cgroup: make cftype->[un]register_event() deal with cgroup_subsys_state inst ead of cgroup") updated the cftype event methods to take @css (cgroup_subsys_state) instead of @cgroup; however, it incorrectly used @css passed to cgroup_write_event_control(), which the dummy_css for the cgroup as the file is a cgroup core file. This leads to oops on event registration. Fix it by using the css matching the event target file. Note that cgroup_write_event_control() now disallows cgroup core files from being event sources. This is for simplicity and doesn't matter as cgroup_event will be moved and made specific to memcg. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 896e035eb6e4..ef43e3f453ef 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4040,10 +4040,10 @@ static void cgroup_event_ptable_queue_proc(struct file *file, * Input must be in format ' '. * Interpretation of args is defined by control file implementation. */ -static int cgroup_write_event_control(struct cgroup_subsys_state *css, +static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, struct cftype *cft, const char *buffer) { - struct cgroup *cgrp = css->cgroup; + struct cgroup *cgrp = dummy_css->cgroup; struct cgroup_event *event; struct cgroup *cgrp_cfile; unsigned int efd, cfd; @@ -4065,7 +4065,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return -ENOMEM; - event->css = css; + INIT_LIST_HEAD(&event->list); init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); init_waitqueue_func_entry(&event->wait, cgroup_event_wake); @@ -4101,6 +4101,23 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } + if (!event->cft->ss) { + ret = -EBADF; + goto out_put_cfile; + } + + /* determine the css of @cfile and associate @event with it */ + rcu_read_lock(); + + ret = -EINVAL; + event->css = cgroup_css(cgrp, event->cft->ss->subsys_id); + if (event->css) + ret = 0; + + rcu_read_unlock(); + if (ret) + goto out_put_cfile; + /* * The file to be monitored must be in the same cgroup as * cgroup.event_control is. @@ -4116,7 +4133,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *css, goto out_put_cfile; } - ret = event->cft->register_event(css, event->cft, + ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) goto out_put_cfile; -- cgit v1.3-8-gc7d7 From 79ac6834c255d9e3832209f3738d6bff7b87c743 Mon Sep 17 00:00:00 2001 From: Christoph Jaeger Date: Tue, 20 Aug 2013 15:33:18 +0930 Subject: module: fix sprintf format specifier in param_get_byte() In param_get_byte(), to which the macro STANDARD_PARAM_DEF(byte, ...) expands, "%c" is used to print an unsigned char. So it gets printed as a character what is not intended here. Use "%hhu" instead. [Rusty: note drivers which would be effected: drivers/net/wireless/cw1200/main.c drivers/ntb/ntb_transport.c:68 drivers/scsi/lpfc/lpfc_attr.c drivers/usb/atm/speedtch.c drivers/usb/gadget/g_ffs.c ] Acked-by: Jon Mason (for ntb) Acked-by: Michal Nazarewicz (for g_ffs.c) Signed-off-by: Christoph Jaeger Signed-off-by: Rusty Russell --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 440e65d1a544..59f7ac7bec04 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -252,7 +252,7 @@ int parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%c", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul); STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); -- cgit v1.3-8-gc7d7 From ab013c5f60b7ead254863c75b9adc2a47992d01b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 20 Aug 2013 15:33:19 +0930 Subject: module: Add flag to allow mod params to have no arguments Currently the params.c code allows only two "set" functions to have no arguments. If a parameter does not have an argument, then it looks at the set function and tests if it is either param_set_bool() or param_set_bint(). If it is not one of these functions, then it fails the loading of the module. But there may be module parameters that have different set functions and still allow no arguments. But unless each of these cases adds their function to the if statement, it wont be allowed to have no arguments. This method gets rather messing and does not scale. Instead, introduce a flags field to the kernel_param_ops, where if the flag KERNEL_PARAM_FL_NOARG is set, the parameter will not fail if it does not contain an argument. It will be expected that the corresponding set function can handle a NULL pointer as "val". Signed-off-by: Steven Rostedt Signed-off-by: Rusty Russell --- include/linux/moduleparam.h | 13 ++++++++++++- kernel/params.c | 6 ++++-- 2 files changed, 16 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 27d9da3f86ff..c3eb102a9cc8 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -36,7 +36,18 @@ static const char __UNIQUE_ID(name)[] \ struct kernel_param; +/* + * Flags available for kernel_param_ops + * + * NOARG - the parameter allows for no argument (foo instead of foo=1) + */ +enum { + KERNEL_PARAM_FL_NOARG = (1 << 0) +}; + struct kernel_param_ops { + /* How the ops should behave */ + unsigned int flags; /* Returns 0, or -errno. arg is in kp->arg. */ int (*set)(const char *val, const struct kernel_param *kp); /* Returns length written or -errno. Buffer is 4k (ie. be short!) */ @@ -187,7 +198,7 @@ struct kparam_array /* Obsolete - use module_param_cb() */ #define module_param_call(name, set, get, arg, perm) \ static struct kernel_param_ops __param_ops_##name = \ - { (void *)set, (void *)get }; \ + { 0, (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ name, &__param_ops_##name, arg, \ (perm) + sizeof(__check_old_set_param(set))*0, -1) diff --git a/kernel/params.c b/kernel/params.c index 59f7ac7bec04..ec4299cfade8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -103,8 +103,8 @@ static int parse_one(char *param, || params[i].level > max_level) return 0; /* No one handled NULL, so do it here. */ - if (!val && params[i].ops->set != param_set_bool - && params[i].ops->set != param_set_bint) + if (!val && + !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG)) return -EINVAL; pr_debug("handling %s with %p\n", param, params[i].ops->set); @@ -320,6 +320,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp) EXPORT_SYMBOL(param_get_bool); struct kernel_param_ops param_ops_bool = { + .flags = KERNEL_PARAM_FL_NOARG, .set = param_set_bool, .get = param_get_bool, }; @@ -370,6 +371,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp) EXPORT_SYMBOL(param_set_bint); struct kernel_param_ops param_ops_bint = { + .flags = KERNEL_PARAM_FL_NOARG, .set = param_set_bint, .get = param_get_int, }; -- cgit v1.3-8-gc7d7 From 0ce814096f388f6801587f01c1c5ee1d04e746b3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 20 Aug 2013 15:33:19 +0930 Subject: module: Add NOARG flag for ops with param_set_bool_enable_only() set function The ops that uses param_set_bool_enable_only() as its set function can easily handle being used without an argument. There's no reason to fail the loading of the module if it does not have one. Signed-off-by: Steven Rostedt Signed-off-by: Rusty Russell --- kernel/module.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 206915830d29..4eb26b6d6547 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -136,6 +136,7 @@ static int param_set_bool_enable_only(const char *val, } static const struct kernel_param_ops param_ops_bool_enable_only = { + .flags = KERNEL_PARAM_FL_NOARG, .set = param_set_bool_enable_only, .get = param_get_bool, }; -- cgit v1.3-8-gc7d7 From cc56ded3fdd365e07e03315379ee6612a68fd817 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 20 Aug 2013 15:34:21 +0930 Subject: kernel/module.c: use scnprintf() instead of sprintf() For some strings, they are permitted to be larger than PAGE_SIZE, so need use scnprintf() instead of sprintf(), or it will cause issue. One case is: if a module version is crazy defined (length more than PAGE_SIZE), 'modinfo' command is still OK (print full contents), but for "cat /sys/modules/'modname'/version", will cause issue in kernel. Signed-off-by: Chen Gang Signed-off-by: Rusty Russell --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 4eb26b6d6547..40ee1dc3c3bf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -604,7 +604,7 @@ static void setup_modinfo_##field(struct module *mod, const char *s) \ static ssize_t show_modinfo_##field(struct module_attribute *mattr, \ struct module_kobject *mk, char *buffer) \ { \ - return sprintf(buffer, "%s\n", mk->mod->field); \ + return scnprintf(buffer, PAGE_SIZE, "%s\n", mk->mod->field); \ } \ static int modinfo_##field##_exists(struct module *mod) \ { \ -- cgit v1.3-8-gc7d7 From f4940ab7c5992d3fabcda039744fb7657749798e Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Tue, 20 Aug 2013 15:35:04 +0930 Subject: kernel/params.c: use scnprintf() instead of sprintf() For some strings (e.g. version string), they are permitted to be larger than PAGE_SIZE (although meaningless), so recommend to use scnprintf() instead of sprintf(). Signed-off-by: Chen Gang Signed-off-by: Rusty Russell --- kernel/params.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index ec4299cfade8..e5f8f17e57cf 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -241,7 +241,8 @@ int parse_args(const char *doing, } \ int param_get_##name(char *buffer, const struct kernel_param *kp) \ { \ - return sprintf(buffer, format, *((type *)kp->arg)); \ + return scnprintf(buffer, PAGE_SIZE, format, \ + *((type *)kp->arg)); \ } \ struct kernel_param_ops param_ops_##name = { \ .set = param_set_##name, \ @@ -285,7 +286,7 @@ EXPORT_SYMBOL(param_set_charp); int param_get_charp(char *buffer, const struct kernel_param *kp) { - return sprintf(buffer, "%s", *((char **)kp->arg)); + return scnprintf(buffer, PAGE_SIZE, "%s", *((char **)kp->arg)); } EXPORT_SYMBOL(param_get_charp); @@ -829,7 +830,7 @@ ssize_t __modver_version_show(struct module_attribute *mattr, struct module_version_attribute *vattr = container_of(mattr, struct module_version_attribute, mattr); - return sprintf(buf, "%s\n", vattr->version); + return scnprintf(buf, PAGE_SIZE, "%s\n", vattr->version); } extern const struct module_version_attribute *__start___modver[]; -- cgit v1.3-8-gc7d7 From d185af300fe43c130083851ca918ea2bb9600f0f Mon Sep 17 00:00:00 2001 From: Yacine Belkadi Date: Wed, 31 Jul 2013 14:59:24 -0700 Subject: workqueue: fix some scripts/kernel-doc warnings When building the htmldocs (in verbose mode), scripts/kernel-doc reports the following type of warnings: Warning(kernel/workqueue.c:653): No description found for return value of 'get_work_pool' Fix them by: - Using "Return:" sections to introduce descriptions of return values - Adding some missing descriptions Signed-off-by: Yacine Belkadi Signed-off-by: Jiri Kosina --- kernel/workqueue.c | 107 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0b72e816b8d0..7f01a3eeaf95 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -540,6 +540,8 @@ static int worker_pool_assign_id(struct worker_pool *pool) * This must be called either with pwq_lock held or sched RCU read locked. * If the pwq needs to be used beyond the locking in effect, the caller is * responsible for guaranteeing that the pwq stays online. + * + * Return: The unbound pool_workqueue for @node. */ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) @@ -638,8 +640,6 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * get_work_pool - return the worker_pool a given work was associated with * @work: the work item of interest * - * Return the worker_pool @work was last associated with. %NULL if none. - * * Pools are created and destroyed under wq_pool_mutex, and allows read * access under sched-RCU read lock. As such, this function should be * called under wq_pool_mutex or with preemption disabled. @@ -648,6 +648,8 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * mentioned locking is in effect. If the returned pool needs to be used * beyond the critical section, the caller is responsible for ensuring the * returned pool is and stays online. + * + * Return: The worker_pool @work was last associated with. %NULL if none. */ static struct worker_pool *get_work_pool(struct work_struct *work) { @@ -671,7 +673,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) * get_work_pool_id - return the worker pool ID a given work is associated with * @work: the work item of interest * - * Return the worker_pool ID @work was last associated with. + * Return: The worker_pool ID @work was last associated with. * %WORK_OFFQ_POOL_NONE if none. */ static int get_work_pool_id(struct work_struct *work) @@ -830,7 +832,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) * CONTEXT: * spin_lock_irq(rq->lock) * - * RETURNS: + * Return: * Worker task on @cpu to wake up, %NULL if none. */ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) @@ -965,8 +967,8 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * CONTEXT: * spin_lock_irq(pool->lock). * - * RETURNS: - * Pointer to worker which is executing @work if found, NULL + * Return: + * Pointer to worker which is executing @work if found, %NULL * otherwise. */ static struct worker *find_worker_executing_work(struct worker_pool *pool, @@ -1154,14 +1156,16 @@ out_put: * @flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any - * stable state - idle, on timer or on worklist. Return values are + * stable state - idle, on timer or on worklist. * + * Return: * 1 if @work was pending and we successfully stole PENDING * 0 if @work was idle and we claimed PENDING * -EAGAIN if PENDING couldn't be grabbed at the moment, safe to busy-retry * -ENOENT if someone else is canceling @work, this state may persist * for arbitrarily long * + * Note: * On >= 0 return, the caller owns @work's PENDING bit. To avoid getting * interrupted while holding PENDING and @work off queue, irq must be * disabled on entry. This, combined with delayed_work->timer being @@ -1403,10 +1407,10 @@ retry: * @wq: workqueue to use * @work: work to queue * - * Returns %false if @work was already on a queue, %true otherwise. - * * We queue the work to a specific CPU, the caller must ensure it * can't go away. + * + * Return: %false if @work was already on a queue, %true otherwise. */ bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work) @@ -1476,7 +1480,7 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, * @dwork: work to queue * @delay: number of jiffies to wait before queueing * - * Returns %false if @work was already on a queue, %true otherwise. If + * Return: %false if @work was already on a queue, %true otherwise. If * @delay is zero and @dwork is idle, it will be scheduled for immediate * execution. */ @@ -1512,7 +1516,7 @@ EXPORT_SYMBOL(queue_delayed_work_on); * zero, @work is guaranteed to be scheduled immediately regardless of its * current state. * - * Returns %false if @dwork was idle and queued, %true if @dwork was + * Return: %false if @dwork was idle and queued, %true if @dwork was * pending and its timer was modified. * * This function is safe to call from any context including IRQ handler. @@ -1627,7 +1631,7 @@ static void worker_leave_idle(struct worker *worker) * Might sleep. Called without any lock but returns with pool->lock * held. * - * RETURNS: + * Return: * %true if the associated pool is online (@worker is successfully * bound), %false if offline. */ @@ -1688,7 +1692,7 @@ static struct worker *alloc_worker(void) * CONTEXT: * Might sleep. Does GFP_KERNEL allocations. * - * RETURNS: + * Return: * Pointer to the newly created worker. */ static struct worker *create_worker(struct worker_pool *pool) @@ -1788,6 +1792,8 @@ static void start_worker(struct worker *worker) * @pool: the target pool * * Grab the managership of @pool and create and start a new worker for it. + * + * Return: 0 on success. A negative error code otherwise. */ static int create_and_start_worker(struct worker_pool *pool) { @@ -1932,7 +1938,7 @@ static void pool_mayday_timeout(unsigned long __pool) * multiple times. Does GFP_KERNEL allocations. Called only from * manager. * - * RETURNS: + * Return: * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ @@ -1989,7 +1995,7 @@ restart: * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Called only from manager. * - * RETURNS: + * Return: * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ @@ -2032,7 +2038,7 @@ static bool maybe_destroy_workers(struct worker_pool *pool) * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. * - * RETURNS: + * Return: * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. */ @@ -2246,6 +2252,8 @@ static void process_scheduled_works(struct worker *worker) * work items regardless of their specific target workqueue. The only * exception is work items which belong to workqueues with a rescuer which * will be explained in rescuer_thread(). + * + * Return: 0 */ static int worker_thread(void *__worker) { @@ -2344,6 +2352,8 @@ sleep: * those works so that forward progress can be guaranteed. * * This should happen rarely. + * + * Return: 0 */ static int rescuer_thread(void *__rescuer) { @@ -2516,7 +2526,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * CONTEXT: * mutex_lock(wq->mutex). * - * RETURNS: + * Return: * %true if @flush_color >= 0 and there's something to flush. %false * otherwise. */ @@ -2824,7 +2834,7 @@ already_gone: * Wait until @work has finished execution. @work is guaranteed to be idle * on return if it hasn't been requeued since flush started. * - * RETURNS: + * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ @@ -2884,7 +2894,7 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) * The caller must ensure that the workqueue on which @work was last * queued can't be destroyed before this function returns. * - * RETURNS: + * Return: * %true if @work was pending, %false otherwise. */ bool cancel_work_sync(struct work_struct *work) @@ -2901,7 +2911,7 @@ EXPORT_SYMBOL_GPL(cancel_work_sync); * immediate execution. Like flush_work(), this function only * considers the last queueing instance of @dwork. * - * RETURNS: + * Return: * %true if flush_work() waited for the work to finish execution, * %false if it was already idle. */ @@ -2919,11 +2929,15 @@ EXPORT_SYMBOL(flush_delayed_work); * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel * - * Kill off a pending delayed_work. Returns %true if @dwork was pending - * and canceled; %false if wasn't pending. Note that the work callback - * function may still be running on return, unless it returns %true and the - * work doesn't re-arm itself. Explicitly flush or use - * cancel_delayed_work_sync() to wait on it. + * Kill off a pending delayed_work. + * + * Return: %true if @dwork was pending and canceled; %false if it wasn't + * pending. + * + * Note: + * The work callback function may still be running on return, unless + * it returns %true and the work doesn't re-arm itself. Explicitly flush or + * use cancel_delayed_work_sync() to wait on it. * * This function is safe to call from any context including IRQ handler. */ @@ -2952,7 +2966,7 @@ EXPORT_SYMBOL(cancel_delayed_work); * * This is cancel_work_sync() for delayed works. * - * RETURNS: + * Return: * %true if @dwork was pending, %false otherwise. */ bool cancel_delayed_work_sync(struct delayed_work *dwork) @@ -2969,7 +2983,7 @@ EXPORT_SYMBOL(cancel_delayed_work_sync); * system workqueue and blocks until all CPUs have completed. * schedule_on_each_cpu() is very slow. * - * RETURNS: + * Return: * 0 on success, -errno on failure. */ int schedule_on_each_cpu(work_func_t func) @@ -3037,7 +3051,7 @@ EXPORT_SYMBOL(flush_scheduled_work); * Executes the function immediately if process context is available, * otherwise schedules the function for delayed execution. * - * Returns: 0 - function was executed + * Return: 0 - function was executed * 1 - function was scheduled for execution */ int execute_in_process_context(work_func_t fn, struct execute_work *ew) @@ -3294,7 +3308,7 @@ static void wq_device_release(struct device *dev) * apply_workqueue_attrs() may race against userland updating the * attributes. * - * Returns 0 on success, -errno on failure. + * Return: 0 on success, -errno on failure. */ int workqueue_sysfs_register(struct workqueue_struct *wq) { @@ -3387,7 +3401,9 @@ void free_workqueue_attrs(struct workqueue_attrs *attrs) * @gfp_mask: allocation mask to use * * Allocate a new workqueue_attrs, initialize with default settings and - * return it. Returns NULL on failure. + * return it. + * + * Return: The allocated new workqueue_attr on success. %NULL on failure. */ struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) { @@ -3440,7 +3456,8 @@ static bool wqattrs_equal(const struct workqueue_attrs *a, * @pool: worker_pool to initialize * * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. - * Returns 0 on success, -errno on failure. Even on failure, all fields + * + * Return: 0 on success, -errno on failure. Even on failure, all fields * inside @pool proper are initialized and put_unbound_pool() can be called * on @pool safely to release it. */ @@ -3547,9 +3564,12 @@ static void put_unbound_pool(struct worker_pool *pool) * Obtain a worker_pool which has the same attributes as @attrs, bump the * reference count and return it. If there already is a matching * worker_pool, it will be used; otherwise, this function attempts to - * create a new one. On failure, returns NULL. + * create a new one. * * Should be called with wq_pool_mutex held. + * + * Return: On success, a worker_pool with the same attributes as @attrs. + * On failure, %NULL. */ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { @@ -3779,9 +3799,7 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) * * Calculate the cpumask a workqueue with @attrs should use on @node. If * @cpu_going_down is >= 0, that cpu is considered offline during - * calculation. The result is stored in @cpumask. This function returns - * %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. + * calculation. The result is stored in @cpumask. * * If NUMA affinity is not enabled, @attrs->cpumask is always used. If * enabled and @node has online CPUs requested by @attrs, the returned @@ -3790,6 +3808,9 @@ static void free_unbound_pwq(struct pool_workqueue *pwq) * * The caller is responsible for ensuring that the cpumask of @node stays * stable. + * + * Return: %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. */ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) @@ -3843,8 +3864,9 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, * items finish. Note that a work item which repeatedly requeues itself * back-to-back will stay on its current pwq. * - * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on - * failure. + * Performs GFP_KERNEL allocations. + * + * Return: 0 on success and -errno on failure. */ int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) @@ -4312,6 +4334,8 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); * * Determine whether %current is a workqueue rescuer. Can be used from * work functions to determine whether it's being run off the rescuer task. + * + * Return: %true if %current is a workqueue rescuer. %false otherwise. */ bool current_is_workqueue_rescuer(void) { @@ -4335,7 +4359,7 @@ bool current_is_workqueue_rescuer(void) * workqueue being congested on one CPU doesn't mean the workqueue is also * contested on other CPUs / NUMA nodes. * - * RETURNS: + * Return: * %true if congested, %false otherwise. */ bool workqueue_congested(int cpu, struct workqueue_struct *wq) @@ -4368,7 +4392,7 @@ EXPORT_SYMBOL_GPL(workqueue_congested); * synchronization around this function and the test result is * unreliable and only useful as advisory hints or for debugging. * - * RETURNS: + * Return: * OR'd bitmask of WORK_BUSY_* bits. */ unsigned int work_busy(struct work_struct *work) @@ -4746,9 +4770,10 @@ static void work_for_cpu_fn(struct work_struct *work) * @fn: the function to run * @arg: the function arg * - * This will return the value @fn returns. * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. */ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { @@ -4813,7 +4838,7 @@ void freeze_workqueues_begin(void) * CONTEXT: * Grabs and releases wq_pool_mutex. * - * RETURNS: + * Return: * %true if some freezable workqueues are still busy. %false if freezing * is complete. */ -- cgit v1.3-8-gc7d7 From d2818df168b2c80c7449e47bd349094c308fa323 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 23 Apr 2013 17:05:42 -0700 Subject: rcu: Add duplicate-callback tests to rcutorture This commit adds a object_debug option to rcutorture to allow the debug-object-based checks for duplicate call_rcu() invocations to be deterministically tested. Signed-off-by: Paul E. McKenney Cc: Mathieu Desnoyers Cc: Sedat Dilek Cc: Davidlohr Bueso Cc: Rik van Riel Cc: Thomas Gleixner Cc: Linus Torvalds Tested-by: Sedat Dilek [ paulmck: Banish mid-function ifdef, more or less per Josh Triplett. ] Reviewed-by: Josh Triplett [ paulmck: Improve duplicate-callback test, per Lai Jiangshan. ] --- kernel/rcutorture.c | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 3d936f0fbcd8..c898f14a5b7d 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -66,6 +66,7 @@ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ +static int object_debug; /* Test object-debug double call_rcu()?. */ static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ @@ -100,6 +101,8 @@ module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); module_param(n_barrier_cbs, int, 0444); MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); +module_param(object_debug, int, 0444); +MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); module_param(onoff_interval, int, 0444); MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); module_param(onoff_holdoff, int, 0444); @@ -1934,6 +1937,62 @@ rcu_torture_cleanup(void) rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); } +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD +static void rcu_torture_leak_cb(struct rcu_head *rhp) +{ +} + +static void rcu_torture_err_cb(struct rcu_head *rhp) +{ + /* + * This -might- happen due to race conditions, but is unlikely. + * The scenario that leads to this happening is that the + * first of the pair of duplicate callbacks is queued, + * someone else starts a grace period that includes that + * callback, then the second of the pair must wait for the + * next grace period. Unlikely, but can happen. If it + * does happen, the debug-objects subsystem won't have splatted. + */ + pr_alert("rcutorture: duplicated callback was invoked.\n"); +} +#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + +/* + * Verify that double-free causes debug-objects to complain, but only + * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test + * cannot be carried out. + */ +static void rcu_test_debug_objects(void) +{ +#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD + struct rcu_head rh1; + struct rcu_head rh2; + + init_rcu_head_on_stack(&rh1); + init_rcu_head_on_stack(&rh2); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); + + /* Try to queue the rh2 pair of callbacks for the same grace period. */ + preempt_disable(); /* Prevent preemption from interrupting test. */ + rcu_read_lock(); /* Make it impossible to finish a grace period. */ + call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ + local_irq_disable(); /* Make it harder to start a new grace period. */ + call_rcu(&rh2, rcu_torture_leak_cb); + call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ + local_irq_enable(); + rcu_read_unlock(); + preempt_enable(); + + /* Wait for them all to get done so we can safely return. */ + rcu_barrier(); + pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); + destroy_rcu_head_on_stack(&rh1); + destroy_rcu_head_on_stack(&rh2); +#else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ + pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); +#endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ +} + static int __init rcu_torture_init(void) { @@ -2163,6 +2222,8 @@ rcu_torture_init(void) firsterr = retval; goto unwind; } + if (object_debug) + rcu_test_debug_objects(); rcutorture_record_test_transition(); mutex_unlock(&fullstop_mutex); return 0; -- cgit v1.3-8-gc7d7 From 2ec1f2d98752293f4831ce7d7bdbc3fc36bdd114 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 12 Jun 2013 15:12:21 -0700 Subject: rcu: Increase rcutorture test coverage Currently, rcutorture has separate torture_types to test synchronous, asynchronous, and expedited grace-period primitives. This has two disadvantages: (1) Three times the number of runs to cover the combinations and (2) Little testing of concurrent combinations of the three options. This commit therefore adds a pair of module parameters that control normal and expedited state, with the default being both types, randomly selected, by the fakewriter processes, thus reducing source-code size and increasing test coverage. In addtion, the writer task switches between asynchronous-normal and expedited grace-period primitives driven by the same pair of module parameters. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- Documentation/RCU/torture.txt | 10 ++ kernel/rcutorture.c | 226 ++++++++++++------------------------------ 2 files changed, 73 insertions(+), 163 deletions(-) (limited to 'kernel') diff --git a/Documentation/RCU/torture.txt b/Documentation/RCU/torture.txt index d8a502387397..dac02a6219b1 100644 --- a/Documentation/RCU/torture.txt +++ b/Documentation/RCU/torture.txt @@ -42,6 +42,16 @@ fqs_holdoff Holdoff time (in microseconds) between consecutive calls fqs_stutter Wait time (in seconds) between consecutive bursts of calls to force_quiescent_state(). +gp_normal Make the fake writers use normal synchronous grace-period + primitives. + +gp_exp Make the fake writers use expedited synchronous grace-period + primitives. If both gp_normal and gp_exp are set, or + if neither gp_normal nor gp_exp are set, then randomly + choose the primitive so that about 50% are normal and + 50% expedited. By default, neither are set, which + gives best overall test coverage. + irqreader Says to invoke RCU readers from irq level. This is currently done via timers. Defaults to "1" for variants of RCU that permit this. (Or, more accurately, variants of RCU that do diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c898f14a5b7d..ddef61871878 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -65,6 +65,8 @@ static int irqreader = 1; /* RCU readers from irq (timers). */ static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ static int fqs_holdoff; /* Hold time within burst (us). */ static int fqs_stutter = 3; /* Wait time between bursts (s). */ +static bool gp_exp; /* Use expedited GP wait primitives. */ +static bool gp_normal; /* Use normal GP wait primitives. */ static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ static int object_debug; /* Test object-debug double call_rcu()?. */ static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ @@ -99,6 +101,10 @@ module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); +module_param(gp_normal, bool, 0444); +MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +module_param(gp_exp, bool, 0444); +MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); module_param(n_barrier_cbs, int, 0444); MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); module_param(object_debug, int, 0444); @@ -363,6 +369,7 @@ struct rcu_torture_ops { int (*completed)(void); void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); + void (*exp_sync)(void); void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); void (*cb_barrier)(void); void (*fqs)(void); @@ -446,81 +453,27 @@ static void rcu_torture_deferred_free(struct rcu_torture *p) call_rcu(&p->rtort_rcu, rcu_torture_cb); } -static struct rcu_torture_ops rcu_ops = { - .init = NULL, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, - .readunlock = rcu_torture_read_unlock, - .completed = rcu_torture_completed, - .deferred_free = rcu_torture_deferred_free, - .sync = synchronize_rcu, - .call = call_rcu, - .cb_barrier = rcu_barrier, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu" -}; - -static void rcu_sync_torture_deferred_free(struct rcu_torture *p) -{ - int i; - struct rcu_torture *rp; - struct rcu_torture *rp1; - - cur_ops->sync(); - list_add(&p->rtort_free, &rcu_torture_removed); - list_for_each_entry_safe(rp, rp1, &rcu_torture_removed, rtort_free) { - i = rp->rtort_pipe_count; - if (i > RCU_TORTURE_PIPE_LEN) - i = RCU_TORTURE_PIPE_LEN; - atomic_inc(&rcu_torture_wcount[i]); - if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { - rp->rtort_mbtest = 0; - list_del(&rp->rtort_free); - rcu_torture_free(rp); - } - } -} - static void rcu_sync_torture_init(void) { INIT_LIST_HEAD(&rcu_torture_removed); } -static struct rcu_torture_ops rcu_sync_ops = { +static struct rcu_torture_ops rcu_ops = { .init = rcu_sync_torture_init, .readlock = rcu_torture_read_lock, .read_delay = rcu_read_delay, .readunlock = rcu_torture_read_unlock, .completed = rcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, + .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .can_boost = rcu_can_boost(), - .name = "rcu_sync" -}; - -static struct rcu_torture_ops rcu_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_expedited, - .call = NULL, - .cb_barrier = NULL, + .exp_sync = synchronize_rcu_expedited, + .call = call_rcu, + .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .can_boost = rcu_can_boost(), - .name = "rcu_expedited" + .name = "rcu" }; /* @@ -549,13 +502,14 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p) } static struct rcu_torture_ops rcu_bh_ops = { - .init = NULL, + .init = rcu_sync_torture_init, .readlock = rcu_bh_torture_read_lock, .read_delay = rcu_read_delay, /* just reuse rcu's version. */ .readunlock = rcu_bh_torture_read_unlock, .completed = rcu_bh_torture_completed, .deferred_free = rcu_bh_torture_deferred_free, .sync = synchronize_rcu_bh, + .exp_sync = synchronize_rcu_bh_expedited, .call = call_rcu_bh, .cb_barrier = rcu_barrier_bh, .fqs = rcu_bh_force_quiescent_state, @@ -564,38 +518,6 @@ static struct rcu_torture_ops rcu_bh_ops = { .name = "rcu_bh" }; -static struct rcu_torture_ops rcu_bh_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_bh, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh_sync" -}; - -static struct rcu_torture_ops rcu_bh_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = rcu_bh_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = rcu_bh_torture_read_unlock, - .completed = rcu_bh_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_rcu_bh_expedited, - .call = NULL, - .cb_barrier = NULL, - .fqs = rcu_bh_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "rcu_bh_expedited" -}; - /* * Definitions for srcu torture testing. */ @@ -670,6 +592,11 @@ static int srcu_torture_stats(char *page) return cnt; } +static void srcu_torture_synchronize_expedited(void) +{ + synchronize_srcu_expedited(&srcu_ctl); +} + static struct rcu_torture_ops srcu_ops = { .init = rcu_sync_torture_init, .readlock = srcu_torture_read_lock, @@ -678,45 +605,13 @@ static struct rcu_torture_ops srcu_ops = { .completed = srcu_torture_completed, .deferred_free = srcu_torture_deferred_free, .sync = srcu_torture_synchronize, + .exp_sync = srcu_torture_synchronize_expedited, .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, .name = "srcu" }; -static struct rcu_torture_ops srcu_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_sync" -}; - -static void srcu_torture_synchronize_expedited(void) -{ - synchronize_srcu_expedited(&srcu_ctl); -} - -static struct rcu_torture_ops srcu_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = srcu_torture_read_lock, - .read_delay = srcu_read_delay, - .readunlock = srcu_torture_read_unlock, - .completed = srcu_torture_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = srcu_torture_synchronize_expedited, - .call = NULL, - .cb_barrier = NULL, - .stats = srcu_torture_stats, - .name = "srcu_expedited" -}; - /* * Definitions for sched torture testing. */ @@ -745,6 +640,8 @@ static struct rcu_torture_ops sched_ops = { .completed = rcu_no_completed, .deferred_free = rcu_sched_torture_deferred_free, .sync = synchronize_sched, + .exp_sync = synchronize_sched_expedited, + .call = call_rcu_sched, .cb_barrier = rcu_barrier_sched, .fqs = rcu_sched_force_quiescent_state, .stats = NULL, @@ -752,35 +649,6 @@ static struct rcu_torture_ops sched_ops = { .name = "sched" }; -static struct rcu_torture_ops sched_sync_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_sched, - .cb_barrier = NULL, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .name = "sched_sync" -}; - -static struct rcu_torture_ops sched_expedited_ops = { - .init = rcu_sync_torture_init, - .readlock = sched_torture_read_lock, - .read_delay = rcu_read_delay, /* just reuse rcu's version. */ - .readunlock = sched_torture_read_unlock, - .completed = rcu_no_completed, - .deferred_free = rcu_sync_torture_deferred_free, - .sync = synchronize_sched_expedited, - .cb_barrier = NULL, - .fqs = rcu_sched_force_quiescent_state, - .stats = NULL, - .irq_capable = 1, - .name = "sched_expedited" -}; - /* * RCU torture priority-boost testing. Runs one real-time thread per * CPU for moderate bursts, repeatedly registering RCU callbacks and @@ -930,9 +798,11 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { + bool exp; int i; long oldbatch = rcu_batches_completed(); struct rcu_torture *rp; + struct rcu_torture *rp1; struct rcu_torture *old_rp; static DEFINE_RCU_RANDOM(rand); @@ -957,7 +827,31 @@ rcu_torture_writer(void *arg) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); old_rp->rtort_pipe_count++; - cur_ops->deferred_free(old_rp); + if (gp_normal == gp_exp) + exp = !!(rcu_random(&rand) & 0x80); + else + exp = gp_exp; + if (!exp) { + cur_ops->deferred_free(old_rp); + } else { + cur_ops->exp_sync(); + list_add(&old_rp->rtort_free, + &rcu_torture_removed); + list_for_each_entry_safe(rp, rp1, + &rcu_torture_removed, + rtort_free) { + i = rp->rtort_pipe_count; + if (i > RCU_TORTURE_PIPE_LEN) + i = RCU_TORTURE_PIPE_LEN; + atomic_inc(&rcu_torture_wcount[i]); + if (++rp->rtort_pipe_count >= + RCU_TORTURE_PIPE_LEN) { + rp->rtort_mbtest = 0; + list_del(&rp->rtort_free); + rcu_torture_free(rp); + } + } + } } rcutorture_record_progress(++rcu_torture_current_version); oldbatch = cur_ops->completed(); @@ -986,10 +880,18 @@ rcu_torture_fakewriter(void *arg) schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); udelay(rcu_random(&rand) & 0x3ff); if (cur_ops->cb_barrier != NULL && - rcu_random(&rand) % (nfakewriters * 8) == 0) + rcu_random(&rand) % (nfakewriters * 8) == 0) { cur_ops->cb_barrier(); - else + } else if (gp_normal == gp_exp) { + if (rcu_random(&rand) & 0x80) + cur_ops->sync(); + else + cur_ops->exp_sync(); + } else if (gp_normal) { cur_ops->sync(); + } else { + cur_ops->exp_sync(); + } rcu_stutter_wait("rcu_torture_fakewriter"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); @@ -2000,11 +1902,9 @@ rcu_torture_init(void) int cpu; int firsterr = 0; int retval; - static struct rcu_torture_ops *torture_ops[] = - { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, - &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, - &srcu_ops, &srcu_sync_ops, &srcu_expedited_ops, - &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; + static struct rcu_torture_ops *torture_ops[] = { + &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, + }; mutex_lock(&fullstop_mutex); -- cgit v1.3-8-gc7d7 From d10453e9742f4711b004caae7741476073b4f603 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Jun 2013 15:12:24 -0700 Subject: rcu: Sort rcutorture module parameters There are getting to be too many module parameters to permit the current semi-random order, so this patch orders them. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 101 +++++++++++++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index ddef61871878..e3a1244eeb56 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -52,81 +52,78 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E. McKenney and Josh Triplett "); -static int nreaders = -1; /* # reader threads, defaults to 2*ncpus */ -static int nfakewriters = 4; /* # fake writer threads */ -static int stat_interval = 60; /* Interval between stats, in seconds. */ - /* Zero means "only at end of test". */ -static bool verbose; /* Print more debug info. */ -static bool test_no_idle_hz = true; - /* Test RCU support for tickless idle CPUs. */ -static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ -static int stutter = 5; /* Start/stop testing interval (in sec) */ -static int irqreader = 1; /* RCU readers from irq (timers). */ -static int fqs_duration; /* Duration of bursts (us), 0 to disable. */ -static int fqs_holdoff; /* Hold time within burst (us). */ -static int fqs_stutter = 3; /* Wait time between bursts (s). */ -static bool gp_exp; /* Use expedited GP wait primitives. */ -static bool gp_normal; /* Use normal GP wait primitives. */ -static int n_barrier_cbs; /* Number of callbacks to test RCU barriers. */ -static int object_debug; /* Test object-debug double call_rcu()?. */ -static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */ -static int onoff_holdoff; /* Seconds after boot before CPU hotplugs. */ -static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */ -static int stall_cpu; /* CPU-stall duration (s). 0 for no stall. */ -static int stall_cpu_holdoff = 10; /* Time to wait until stall (s). */ -static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ -static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ -static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ -static char *torture_type = "rcu"; /* What RCU implementation to torture. */ - -module_param(nreaders, int, 0444); -MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); -module_param(nfakewriters, int, 0444); -MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); -module_param(stat_interval, int, 0644); -MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); -module_param(verbose, bool, 0444); -MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); -module_param(test_no_idle_hz, bool, 0444); -MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); -module_param(shuffle_interval, int, 0444); -MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); -module_param(stutter, int, 0444); -MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); -module_param(irqreader, int, 0444); -MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int fqs_duration; module_param(fqs_duration, int, 0444); -MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); +static int fqs_holdoff; module_param(fqs_holdoff, int, 0444); MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +static int fqs_stutter = 3; module_param(fqs_stutter, int, 0444); MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); -module_param(gp_normal, bool, 0444); -MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +static bool gp_exp; module_param(gp_exp, bool, 0444); MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); +static bool gp_normal; +module_param(gp_normal, bool, 0444); +MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); +static int irqreader = 1; +module_param(irqreader, int, 0444); +MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +static int n_barrier_cbs; module_param(n_barrier_cbs, int, 0444); MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); +static int nfakewriters = 4; +module_param(nfakewriters, int, 0444); +MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); +static int nreaders = -1; +module_param(nreaders, int, 0444); +MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); +static int object_debug; module_param(object_debug, int, 0444); MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); -module_param(onoff_interval, int, 0444); -MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int onoff_holdoff; module_param(onoff_holdoff, int, 0444); MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); +static int onoff_interval; +module_param(onoff_interval, int, 0444); +MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); +static int shuffle_interval = 3; +module_param(shuffle_interval, int, 0444); +MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); +static int shutdown_secs; module_param(shutdown_secs, int, 0444); -MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable."); +MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); +static int stall_cpu; module_param(stall_cpu, int, 0444); MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); +static int stall_cpu_holdoff = 10; module_param(stall_cpu_holdoff, int, 0444); MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); +static int stat_interval = 60; +module_param(stat_interval, int, 0644); +MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); +static int stutter = 5; +module_param(stutter, int, 0444); +MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); +static int test_boost = 1; module_param(test_boost, int, 0444); MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); -module_param(test_boost_interval, int, 0444); -MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static int test_boost_duration = 4; module_param(test_boost_duration, int, 0444); MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); +static int test_boost_interval = 7; +module_param(test_boost_interval, int, 0444); +MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); +static bool test_no_idle_hz = true; +module_param(test_no_idle_hz, bool, 0444); +MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); +static char *torture_type = "rcu"; module_param(torture_type, charp, 0444); -MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); +MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); +static bool verbose; +module_param(verbose, bool, 0444); +MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); #define TORTURE_FLAG "-torture:" #define PRINTK_STRING(s) \ -- cgit v1.3-8-gc7d7 From ef47db8e99d53f0da5270accd2ee71bcf9e25f11 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 13 Jun 2013 15:30:00 -0700 Subject: rcu: Remove unused variable from rcu_torture_writer() The oldbatch variable in rcu_torture_writer() is stored to, but never loaded from. This commit therefore removes it. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index e3a1244eeb56..20ce3b6f9aa6 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -797,7 +797,6 @@ rcu_torture_writer(void *arg) { bool exp; int i; - long oldbatch = rcu_batches_completed(); struct rcu_torture *rp; struct rcu_torture *rp1; struct rcu_torture *old_rp; @@ -851,7 +850,6 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); - oldbatch = cur_ops->completed(); rcu_stutter_wait("rcu_torture_writer"); } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); -- cgit v1.3-8-gc7d7 From 7a6a41073c345ff5ef5e81317211481c0da3f7f3 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 06:24:56 -0700 Subject: rcu: Make rcutorture emit online failures if verbose Although rcutorture counts CPU-hotplug online failures, it does not explicitly record which CPUs were having trouble coming online. This commit therefore emits a console message when online failure occurs. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- kernel/rcutorture.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 20ce3b6f9aa6..be63101c6175 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -1434,7 +1434,13 @@ rcu_torture_onoff(void *arg) torture_type, cpu); starttime = jiffies; n_online_attempts++; - if (cpu_up(cpu) == 0) { + ret = cpu_up(cpu); + if (ret) { + if (verbose) + pr_alert("%s" TORTURE_FLAG + "rcu_torture_onoff task: online %d failed: errno %d\n", + torture_type, cpu, ret); + } else { if (verbose) pr_alert("%s" TORTURE_FLAG "rcu_torture_onoff task: onlined %d\n", -- cgit v1.3-8-gc7d7 From 458fb381eacdd23366cfa2fbdf5a467848683e3a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 26 Jul 2013 20:47:42 -0700 Subject: rcu: Simplify _rcu_barrier() processing This commit drops an unneeded ACCESS_ONCE() and simplifies an "our work is done" check in _rcu_barrier(). This applies feedback from Linus (https://lkml.org/lkml/2013/7/26/777) that he gave to similar code in an unrelated patch. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett [ paulmck: Fix comment to match code, reported by Lai Jiangshan. ] --- kernel/rcutree.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index c6a064abd6a0..a4a04f311cfb 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -2817,9 +2817,20 @@ static void _rcu_barrier(struct rcu_state *rsp) * transition. The "if" expression below therefore rounds the old * value up to the next even number and adds two before comparing. */ - snap_done = ACCESS_ONCE(rsp->n_barrier_done); + snap_done = rsp->n_barrier_done; _rcu_barrier_trace(rsp, "Check", -1, snap_done); - if (ULONG_CMP_GE(snap_done, ((snap + 1) & ~0x1) + 2)) { + + /* + * If the value in snap is odd, we needed to wait for the current + * rcu_barrier() to complete, then wait for the next one, in other + * words, we need the value of snap_done to be three larger than + * the value of snap. On the other hand, if the value in snap is + * even, we only had to wait for the next rcu_barrier() to complete, + * in other words, we need the value of snap_done to be only two + * greater than the value of snap. The "(snap + 3) & ~0x1" computes + * this for us (thank you, Linus!). + */ + if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); smp_mb(); /* caller's subsequent code after above check. */ mutex_unlock(&rsp->barrier_mutex); -- cgit v1.3-8-gc7d7 From b11895c45899daff094610f6cdbf7611d74ae2a6 Mon Sep 17 00:00:00 2001 From: Libin Date: Wed, 21 Aug 2013 08:50:39 +0800 Subject: workqueue: Comment correction in file header No functional change. There are two worker pools for each cpu in current implementation (one for normal work items and the other for high priority ones). tj: Whitespace adjustments. Signed-off-by: Libin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f02c4a4a0c3c..eebd9a66c044 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -16,9 +16,10 @@ * * This is the generic async execution mechanism. Work items as are * executed in process context. The worker pool is shared and - * automatically managed. There is one worker pool for each CPU and - * one extra for works which are better served by workers which are - * not bound to any specific CPU. + * automatically managed. There are two worker pools for each CPU (one for + * normal work items and the other for high priority ones) and some extra + * pools for workqueues which are not bound to any specific CPU - the + * number of these backing pools is dynamic. * * Please read Documentation/workqueue.txt for details. */ -- cgit v1.3-8-gc7d7 From 2d498db9814c6f3a79b708c8867c7ffcf7b5e2fc Mon Sep 17 00:00:00 2001 From: Libin Date: Wed, 21 Aug 2013 08:50:40 +0800 Subject: workqueue: Fix manage_workers() RETURNS description No functional change. The comment of function manage_workers() RETURNS description is obvious wrong, same as the CONTEXT. Fix it. Signed-off-by: Libin Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index eebd9a66c044..10f655ec8de6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2034,8 +2034,11 @@ static bool maybe_destroy_workers(struct worker_pool *pool) * multiple times. Does GFP_KERNEL allocations. * * RETURNS: - * spin_lock_irq(pool->lock) which may be released and regrabbed - * multiple times. Does GFP_KERNEL allocations. + * %false if the pool don't need management and the caller can safely start + * processing works, %true indicates that the function released pool->lock + * and reacquired it to perform some management function and that the + * conditions that the caller verified while holding the lock before + * calling the function might no longer be true. */ static bool manage_workers(struct worker *worker) { -- cgit v1.3-8-gc7d7 From 3ddc77f6f4a58ee2e49e0e8c0216105c7f8ddd8c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 27 Mar 2013 14:15:37 +0800 Subject: tracing/syscalls: Annotate raw_init function with __init init_syscall_trace() can only be called during kernel bootup only, so we can mark it and the functions it calls as __init. Link: http://lkml.kernel.org/r/51528E89.6080508@huawei.com Signed-off-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_syscalls.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8fd03657bc7d..559329d9bd2f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -200,8 +200,8 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -static -int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) +static int __init +__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { int i; int pos = 0; @@ -228,7 +228,7 @@ int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) return pos; } -static int set_syscall_print_fmt(struct ftrace_event_call *call) +static int __init set_syscall_print_fmt(struct ftrace_event_call *call) { char *print_fmt; int len; @@ -253,7 +253,7 @@ static int set_syscall_print_fmt(struct ftrace_event_call *call) return 0; } -static void free_syscall_print_fmt(struct ftrace_event_call *call) +static void __init free_syscall_print_fmt(struct ftrace_event_call *call) { struct syscall_metadata *entry = call->data; @@ -459,7 +459,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file, mutex_unlock(&syscall_trace_lock); } -static int init_syscall_trace(struct ftrace_event_call *call) +static int __init init_syscall_trace(struct ftrace_event_call *call) { int id; int num; -- cgit v1.3-8-gc7d7 From 779c5e379158de3e96112630c543d3c7b37efab9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jul 2013 19:31:32 +0200 Subject: tracing: Kill trace_create_file_ops() and friends trace_create_file_ops() allocates the copy of id/filter/format/enable file_operations to set "f_op->owner = mod" for fops_get(). However after the recent changes there is no reason to prevent rmmod even if one of these files is opened. A file operation can do nothing but fail after remove_event_file_dir() clears ->i_private for every file removed by trace_module_remove_events(). Kill "struct ftrace_module_file_ops" and fix the compilation errors. Link: http://lkml.kernel.org/r/20130731173132.GA31033@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 153 +++----------------------------------------- 1 file changed, 9 insertions(+), 144 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 29a7ebcfb426..2ec82734b8a7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1683,8 +1683,7 @@ __trace_early_add_new_event(struct ftrace_event_call *call, } struct ftrace_module_file_ops; -static void __add_event_to_tracers(struct ftrace_event_call *call, - struct ftrace_module_file_ops *file_ops); +static void __add_event_to_tracers(struct ftrace_event_call *call); /* Add an additional event_call dynamically */ int trace_add_event_call(struct ftrace_event_call *call) @@ -1695,7 +1694,7 @@ int trace_add_event_call(struct ftrace_event_call *call) ret = __register_event(call, NULL); if (ret >= 0) - __add_event_to_tracers(call, NULL); + __add_event_to_tracers(call); mutex_unlock(&event_mutex); mutex_unlock(&trace_types_lock); @@ -1769,100 +1768,21 @@ int trace_remove_event_call(struct ftrace_event_call *call) #ifdef CONFIG_MODULES -static LIST_HEAD(ftrace_module_file_list); - -/* - * Modules must own their file_operations to keep up with - * reference counting. - */ -struct ftrace_module_file_ops { - struct list_head list; - struct module *mod; - struct file_operations id; - struct file_operations enable; - struct file_operations format; - struct file_operations filter; -}; - -static struct ftrace_module_file_ops * -find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) -{ - /* - * As event_calls are added in groups by module, - * when we find one file_ops, we don't need to search for - * each call in that module, as the rest should be the - * same. Only search for a new one if the last one did - * not match. - */ - if (file_ops && mod == file_ops->mod) - return file_ops; - - list_for_each_entry(file_ops, &ftrace_module_file_list, list) { - if (file_ops->mod == mod) - return file_ops; - } - return NULL; -} - -static struct ftrace_module_file_ops * -trace_create_file_ops(struct module *mod) -{ - struct ftrace_module_file_ops *file_ops; - - /* - * This is a bit of a PITA. To allow for correct reference - * counting, modules must "own" their file_operations. - * To do this, we allocate the file operations that will be - * used in the event directory. - */ - - file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); - if (!file_ops) - return NULL; - - file_ops->mod = mod; - - file_ops->id = ftrace_event_id_fops; - file_ops->id.owner = mod; - - file_ops->enable = ftrace_enable_fops; - file_ops->enable.owner = mod; - - file_ops->filter = ftrace_event_filter_fops; - file_ops->filter.owner = mod; - - file_ops->format = ftrace_event_format_fops; - file_ops->format.owner = mod; - - list_add(&file_ops->list, &ftrace_module_file_list); - - return file_ops; -} - static void trace_module_add_events(struct module *mod) { - struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call **call, **start, **end; start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; - if (start == end) - return; - - file_ops = trace_create_file_ops(mod); - if (!file_ops) - return; - for_each_event(call, start, end) { __register_event(*call, mod); - __add_event_to_tracers(*call, file_ops); + __add_event_to_tracers(*call); } } static void trace_module_remove_events(struct module *mod) { - struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; bool clear_trace = false; @@ -1874,16 +1794,6 @@ static void trace_module_remove_events(struct module *mod) __trace_remove_event_call(call); } } - - /* Now free the file_operations */ - list_for_each_entry(file_ops, &ftrace_module_file_list, list) { - if (file_ops->mod == mod) - break; - } - if (&file_ops->list != &ftrace_module_file_list) { - list_del(&file_ops->list); - kfree(file_ops); - } up_write(&trace_event_sem); /* @@ -1919,62 +1829,22 @@ static int trace_module_notify(struct notifier_block *self, return 0; } -static int -__trace_add_new_mod_event(struct ftrace_event_call *call, - struct trace_array *tr, - struct ftrace_module_file_ops *file_ops) -{ - return __trace_add_new_event(call, tr, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); -} - #else -static inline struct ftrace_module_file_ops * -find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) -{ - return NULL; -} static inline int trace_module_notify(struct notifier_block *self, unsigned long val, void *data) { return 0; } -static inline int -__trace_add_new_mod_event(struct ftrace_event_call *call, - struct trace_array *tr, - struct ftrace_module_file_ops *file_ops) -{ - return -ENODEV; -} #endif /* CONFIG_MODULES */ /* Create a new event directory structure for a trace directory. */ static void __trace_add_event_dirs(struct trace_array *tr) { - struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call; int ret; list_for_each_entry(call, &ftrace_events, list) { - if (call->mod) { - /* - * Directories for events by modules need to - * keep module ref counts when opened (as we don't - * want the module to disappear when reading one - * of these files). The file_ops keep account of - * the module ref count. - */ - file_ops = find_ftrace_file_ops(file_ops, call->mod); - if (!file_ops) - continue; /* Warn? */ - ret = __trace_add_new_mod_event(call, tr, file_ops); - if (ret < 0) - pr_warning("Could not create directory for event %s\n", - call->name); - continue; - } ret = __trace_add_new_event(call, tr, &ftrace_event_id_fops, &ftrace_enable_fops, @@ -2332,21 +2202,16 @@ __trace_remove_event_dirs(struct trace_array *tr) remove_event_file_dir(file); } -static void -__add_event_to_tracers(struct ftrace_event_call *call, - struct ftrace_module_file_ops *file_ops) +static void __add_event_to_tracers(struct ftrace_event_call *call) { struct trace_array *tr; list_for_each_entry(tr, &ftrace_trace_arrays, list) { - if (file_ops) - __trace_add_new_mod_event(call, tr, file_ops); - else - __trace_add_new_event(call, tr, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); + __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); } } -- cgit v1.3-8-gc7d7 From 620a30e97febc8332590376c94ed0e9dba522bc8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jul 2013 19:31:35 +0200 Subject: tracing: Don't pass file_operations array to event_create_dir() Now that event_create_dir() and __trace_add_new_event() always use the same file_operations we can kill these arguments and simplify the code. Link: http://lkml.kernel.org/r/20130731173135.GA31040@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 46 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2ec82734b8a7..4e706a01f1f9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1489,12 +1489,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } static int -event_create_dir(struct dentry *parent, - struct ftrace_event_file *file, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +event_create_dir(struct dentry *parent, struct ftrace_event_file *file) { struct ftrace_event_call *call = file->event_call; struct trace_array *tr = file->tr; @@ -1522,12 +1517,13 @@ event_create_dir(struct dentry *parent, if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) trace_create_file("enable", 0644, file->dir, file, - enable); + &ftrace_enable_fops); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) trace_create_file("id", 0444, file->dir, - (void *)(long)call->event.type, id); + (void *)(long)call->event.type, + &ftrace_event_id_fops); #endif /* @@ -1544,10 +1540,10 @@ event_create_dir(struct dentry *parent, } } trace_create_file("filter", 0644, file->dir, call, - filter); + &ftrace_event_filter_fops); trace_create_file("format", 0444, file->dir, call, - format); + &ftrace_event_format_fops); return 0; } @@ -1648,12 +1644,7 @@ trace_create_new_event(struct ftrace_event_call *call, /* Add an event to a trace directory */ static int -__trace_add_new_event(struct ftrace_event_call *call, - struct trace_array *tr, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +__trace_add_new_event(struct ftrace_event_call *call, struct trace_array *tr) { struct ftrace_event_file *file; @@ -1661,7 +1652,7 @@ __trace_add_new_event(struct ftrace_event_call *call, if (!file) return -ENOMEM; - return event_create_dir(tr->event_dir, file, id, enable, filter, format); + return event_create_dir(tr->event_dir, file); } /* @@ -1845,11 +1836,7 @@ __trace_add_event_dirs(struct trace_array *tr) int ret; list_for_each_entry(call, &ftrace_events, list) { - ret = __trace_add_new_event(call, tr, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = __trace_add_new_event(call, tr); if (ret < 0) pr_warning("Could not create directory for event %s\n", call->name); @@ -2157,11 +2144,7 @@ __trace_early_add_event_dirs(struct trace_array *tr) list_for_each_entry(file, &tr->events, list) { - ret = event_create_dir(tr->event_dir, file, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); + ret = event_create_dir(tr->event_dir, file); if (ret < 0) pr_warning("Could not create directory for event %s\n", file->event_call->name); @@ -2206,13 +2189,8 @@ static void __add_event_to_tracers(struct ftrace_event_call *call) { struct trace_array *tr; - list_for_each_entry(tr, &ftrace_trace_arrays, list) { - __trace_add_new_event(call, tr, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - } + list_for_each_entry(tr, &ftrace_trace_arrays, list) + __trace_add_new_event(call, tr); } static struct notifier_block trace_module_nb = { -- cgit v1.3-8-gc7d7 From 836d481ed7c91152c6144ea3a3363cad3940b3e0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jul 2013 19:31:37 +0200 Subject: tracing: Kill the !CONFIG_MODULES code in trace_events.c Move trace_module_nb under CONFIG_MODULES and kill the dummy trace_module_notify(). Imho it doesn't make sense to define "struct notifier_block" and its .notifier_call just to avoid "ifdef" in event_trace_init(), and all other !CONFIG_MODULES code has already gone away. Link: http://lkml.kernel.org/r/20130731173137.GA31043@redhat.com Signed-off-by: Oleg Nesterov Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4e706a01f1f9..368a4d50cc30 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1820,12 +1820,10 @@ static int trace_module_notify(struct notifier_block *self, return 0; } -#else -static inline int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} +static struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; #endif /* CONFIG_MODULES */ /* Create a new event directory structure for a trace directory. */ @@ -2193,11 +2191,6 @@ static void __add_event_to_tracers(struct ftrace_event_call *call) __trace_add_new_event(call, tr); } -static struct notifier_block trace_module_nb = { - .notifier_call = trace_module_notify, - .priority = 0, -}; - extern struct ftrace_event_call *__start_ftrace_events[]; extern struct ftrace_event_call *__stop_ftrace_events[]; @@ -2402,10 +2395,11 @@ static __init int event_trace_init(void) if (ret) return ret; +#ifdef CONFIG_MODULES ret = register_module_notifier(&trace_module_nb); if (ret) pr_warning("Failed to register trace events module notifier\n"); - +#endif return 0; } early_initcall(event_trace_memsetup); -- cgit v1.3-8-gc7d7 From ccfe9e42e451232dd17a230d1b4e979c3d15311e Mon Sep 17 00:00:00 2001 From: Alexander Z Lam Date: Thu, 8 Aug 2013 09:47:45 -0700 Subject: tracing: Make tracing_cpumask available for all instances Allow tracer instances to disable tracing by cpu by moving the static global tracing_cpumask into trace_array. Link: http://lkml.kernel.org/r/921622317f239bfc2283cac2242647801ef584f2.1375980149.git.azl@google.com Cc: Vaibhav Nagarnaik Cc: David Sharp Cc: Alexander Z Lam Signed-off-by: Alexander Z Lam Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 37 ++++++++++++++++++++----------------- kernel/trace/trace.h | 1 + 2 files changed, 21 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 496f94d57698..7974ba20557d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3165,11 +3165,6 @@ static const struct file_operations show_traces_fops = { .llseek = seq_lseek, }; -/* - * Only trace on a CPU if the bitmask is set: - */ -static cpumask_var_t tracing_cpumask; - /* * The tracer itself will not take this lock, but still we want * to provide a consistent cpumask to user-space: @@ -3186,11 +3181,12 @@ static ssize_t tracing_cpumask_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { + struct trace_array *tr = file_inode(filp)->i_private; int len; mutex_lock(&tracing_cpumask_update_lock); - len = cpumask_scnprintf(mask_str, count, tracing_cpumask); + len = cpumask_scnprintf(mask_str, count, tr->tracing_cpumask); if (count - len < 2) { count = -EINVAL; goto out_err; @@ -3208,7 +3204,7 @@ static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { - struct trace_array *tr = filp->private_data; + struct trace_array *tr = file_inode(filp)->i_private; cpumask_var_t tracing_cpumask_new; int err, cpu; @@ -3228,12 +3224,12 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, * Increase/decrease the disabled counter if we are * about to flip a bit in the cpumask: */ - if (cpumask_test_cpu(cpu, tracing_cpumask) && + if (cpumask_test_cpu(cpu, tr->tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); } - if (!cpumask_test_cpu(cpu, tracing_cpumask) && + if (!cpumask_test_cpu(cpu, tr->tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); @@ -3242,7 +3238,7 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, arch_spin_unlock(&ftrace_max_lock); local_irq_enable(); - cpumask_copy(tracing_cpumask, tracing_cpumask_new); + cpumask_copy(tr->tracing_cpumask, tracing_cpumask_new); mutex_unlock(&tracing_cpumask_update_lock); free_cpumask_var(tracing_cpumask_new); @@ -3256,9 +3252,10 @@ err_unlock: } static const struct file_operations tracing_cpumask_fops = { - .open = tracing_open_generic, + .open = tracing_open_generic_tr, .read = tracing_cpumask_read, .write = tracing_cpumask_write, + .release = tracing_release_generic_tr, .llseek = generic_file_llseek, }; @@ -5938,6 +5935,11 @@ static int new_instance_create(const char *name) if (!tr->name) goto out_free_tr; + if (!alloc_cpumask_var(&tr->tracing_cpumask, GFP_KERNEL)) + goto out_free_tr; + + cpumask_copy(tr->tracing_cpumask, cpu_all_mask); + raw_spin_lock_init(&tr->start_lock); tr->current_trace = &nop_trace; @@ -5969,6 +5971,7 @@ static int new_instance_create(const char *name) out_free_tr: if (tr->trace_buffer.buffer) ring_buffer_free(tr->trace_buffer.buffer); + free_cpumask_var(tr->tracing_cpumask); kfree(tr->name); kfree(tr); @@ -6098,6 +6101,9 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; + trace_create_file("tracing_cpumask", 0644, d_tracer, + tr, &tracing_cpumask_fops); + trace_create_file("trace_options", 0644, d_tracer, tr, &tracing_iter_fops); @@ -6147,9 +6153,6 @@ static __init int tracer_init_debugfs(void) init_tracer_debugfs(&global_trace, d_tracer); - trace_create_file("tracing_cpumask", 0644, d_tracer, - &global_trace, &tracing_cpumask_fops); - trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); @@ -6371,7 +6374,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) + if (!alloc_cpumask_var(&global_trace.tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; /* Only allocate trace_printk buffers if a trace_printk exists */ @@ -6386,7 +6389,7 @@ __init static int tracer_alloc_buffers(void) ring_buf_size = 1; cpumask_copy(tracing_buffer_mask, cpu_possible_mask); - cpumask_copy(tracing_cpumask, cpu_all_mask); + cpumask_copy(global_trace.tracing_cpumask, cpu_all_mask); raw_spin_lock_init(&global_trace.start_lock); @@ -6441,7 +6444,7 @@ out_free_cpumask: #ifdef CONFIG_TRACER_MAX_TRACE free_percpu(global_trace.max_buffer.data); #endif - free_cpumask_var(tracing_cpumask); + free_cpumask_var(global_trace.tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); out: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index afaae41b0a02..502fed770751 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -206,6 +206,7 @@ struct trace_array { struct dentry *event_dir; struct list_head systems; struct list_head events; + cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; }; -- cgit v1.3-8-gc7d7 From 1a6661dafd2528d03d0eaed898ad596816dfe738 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 23 Aug 2013 14:24:41 -0700 Subject: workqueue: convert bus code to use dev_groups The dev_attrs field of struct bus_type is going away soon, dev_groups should be used instead. This converts the workqueue bus code to use the correct field. Acked-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/workqueue.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0b72e816b8d0..d1b5f0662651 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3081,25 +3081,26 @@ static struct workqueue_struct *dev_to_wq(struct device *dev) return wq_dev->wq; } -static ssize_t wq_per_cpu_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); } +static DEVICE_ATTR_RO(per_cpu); -static ssize_t wq_max_active_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct workqueue_struct *wq = dev_to_wq(dev); return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); } -static ssize_t wq_max_active_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +static ssize_t max_active_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) { struct workqueue_struct *wq = dev_to_wq(dev); int val; @@ -3110,12 +3111,14 @@ static ssize_t wq_max_active_store(struct device *dev, workqueue_set_max_active(wq, val); return count; } +static DEVICE_ATTR_RW(max_active); -static struct device_attribute wq_sysfs_attrs[] = { - __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), - __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), - __ATTR_NULL, +static struct attribute *wq_sysfs_attrs[] = { + &dev_attr_per_cpu.attr, + &dev_attr_max_active.attr, + NULL, }; +ATTRIBUTE_GROUPS(wq_sysfs); static ssize_t wq_pool_ids_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -3265,7 +3268,7 @@ static struct device_attribute wq_sysfs_unbound_attrs[] = { static struct bus_type wq_subsys = { .name = "workqueue", - .dev_attrs = wq_sysfs_attrs, + .dev_groups = wq_sysfs_groups, }; static int __init wq_sysfs_init(void) -- cgit v1.3-8-gc7d7 From 35cf083619da5677f83e9a8eae813f0b413d7082 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: rename cgroup_css_from_dir() to css_from_dir() and update its syntax cgroup_css_from_dir() will grow another user. In preparation, make the following changes. * All css functions are prefixed with just "css_", rename it to css_from_dir(). * Take dentry * instead of file * as dentry is what ultimately identifies a cgroup and file may not always be available. Note that the function now checkes whether @dentry->d_inode is NULL as the caller now may specify a negative dentry. * Make it take cgroup_subsys * instead of integer subsys_id. This simplifies the function and allows specifying no subsystem for cgroup->dummy_css. * Make return section a bit less verbose. This patch doesn't introduce any behavior changes. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar --- include/linux/cgroup.h | 3 ++- kernel/cgroup.c | 26 ++++++++++---------------- kernel/events/core.c | 2 +- 3 files changed, 13 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b685955d4b29..21ba29869eb8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -903,7 +903,8 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg, /* Get id and depth of css */ unsigned short css_id(struct cgroup_subsys_state *css); -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id); +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss); #else /* !CONFIG_CGROUPS */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef43e3f453ef..921b1387c944 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5700,34 +5700,28 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) EXPORT_SYMBOL_GPL(css_lookup); /** - * cgroup_css_from_dir - get corresponding css from file open on cgroup dir - * @f: directory file of interest - * @id: subsystem id of interest + * css_from_dir - get corresponding css from the dentry of a cgroup dir + * @dentry: directory dentry of interest + * @ss: subsystem of interest * * Must be called under RCU read lock. The caller is responsible for * pinning the returned css if it needs to be accessed outside the RCU * critical section. */ -struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) +struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, + struct cgroup_subsys *ss) { struct cgroup *cgrp; - struct inode *inode; - struct cgroup_subsys_state *css; WARN_ON_ONCE(!rcu_read_lock_held()); - inode = file_inode(f); - /* check in cgroup filesystem dir */ - if (inode->i_op != &cgroup_dir_inode_operations) + /* is @dentry a cgroup dir? */ + if (!dentry->d_inode || + dentry->d_inode->i_op != &cgroup_dir_inode_operations) return ERR_PTR(-EBADF); - if (id < 0 || id >= CGROUP_SUBSYS_COUNT) - return ERR_PTR(-EINVAL); - - /* get cgroup */ - cgrp = __d_cgrp(f->f_dentry); - css = cgroup_css(cgrp, id); - return css ? css : ERR_PTR(-ENOENT); + cgrp = __d_cgrp(dentry); + return cgroup_css(cgrp, ss->subsys_id) ?: ERR_PTR(-ENOENT); } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 23261f957713..b59ab6632f30 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -593,7 +593,7 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event, rcu_read_lock(); - css = cgroup_css_from_dir(f.file, perf_subsys_id); + css = css_from_dir(f.file->f_dentry, &perf_subsys); if (IS_ERR(css)) { ret = PTR_ERR(css); goto out; -- cgit v1.3-8-gc7d7 From ca8bdcaff0d77990fb69e0f946018c96a70851cc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys cgroup_css() is no longer used in hot paths. Make it take struct cgroup_subsys * and allow the users to specify NULL subsys to obtain the dummy_css. This removes open-coded NULL subsystem testing in a couple users and generally simplifies the code. After this patch, css_from_dir() also allows NULL @ss and returns the matching dummy_css. This behavior change doesn't affect its only user - perf. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- kernel/cgroup.c | 90 +++++++++++++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 921b1387c944..7516668d8325 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -226,19 +226,22 @@ static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[], /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest - * @subsys_id: the subsystem of interest + * @ss: the subsystem of interest (%NULL returns the dummy_css) * - * Return @cgrp's css (cgroup_subsys_state) associated with @subsys_id. - * This function must be called either under cgroup_mutex or - * rcu_read_lock() and the caller is responsible for pinning the returned - * css if it wants to keep accessing it outside the said locks. This - * function may return %NULL if @cgrp doesn't have @subsys_id enabled. + * Return @cgrp's css (cgroup_subsys_state) associated with @ss. This + * function must be called either under cgroup_mutex or rcu_read_lock() and + * the caller is responsible for pinning the returned css if it wants to + * keep accessing it outside the said locks. This function may return + * %NULL if @cgrp doesn't have @subsys_id enabled. */ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, - int subsys_id) + struct cgroup_subsys *ss) { - return rcu_dereference_check(cgrp->subsys[subsys_id], - lockdep_is_held(&cgroup_mutex)); + if (ss) + return rcu_dereference_check(cgrp->subsys[ss->subsys_id], + lockdep_is_held(&cgroup_mutex)); + else + return &cgrp->dummy_css; } /* convenient tests for these bits */ @@ -580,7 +583,7 @@ static struct css_set *find_existing_css_set(struct css_set *old_cset, /* Subsystem is in this hierarchy. So we want * the subsystem state from the new * cgroup */ - template[i] = cgroup_css(cgrp, i); + template[i] = cgroup_css(cgrp, ss); } else { /* Subsystem is not in this hierarchy, so we * don't want to change the subsystem state */ @@ -1062,30 +1065,30 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (bit & added_mask) { /* We're binding this subsystem to this hierarchy */ - BUG_ON(cgroup_css(cgrp, i)); - BUG_ON(!cgroup_css(cgroup_dummy_top, i)); - BUG_ON(cgroup_css(cgroup_dummy_top, i)->cgroup != cgroup_dummy_top); + BUG_ON(cgroup_css(cgrp, ss)); + BUG_ON(!cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgroup_dummy_top, ss)->cgroup != cgroup_dummy_top); rcu_assign_pointer(cgrp->subsys[i], - cgroup_css(cgroup_dummy_top, i)); - cgroup_css(cgrp, i)->cgroup = cgrp; + cgroup_css(cgroup_dummy_top, ss)); + cgroup_css(cgrp, ss)->cgroup = cgrp; list_move(&ss->sibling, &root->subsys_list); ss->root = root; if (ss->bind) - ss->bind(cgroup_css(cgrp, i)); + ss->bind(cgroup_css(cgrp, ss)); /* refcount was already taken, and we're keeping it */ root->subsys_mask |= bit; } else if (bit & removed_mask) { /* We're removing this subsystem */ - BUG_ON(cgroup_css(cgrp, i) != cgroup_css(cgroup_dummy_top, i)); - BUG_ON(cgroup_css(cgrp, i)->cgroup != cgrp); + BUG_ON(cgroup_css(cgrp, ss) != cgroup_css(cgroup_dummy_top, ss)); + BUG_ON(cgroup_css(cgrp, ss)->cgroup != cgrp); if (ss->bind) - ss->bind(cgroup_css(cgroup_dummy_top, i)); + ss->bind(cgroup_css(cgroup_dummy_top, ss)); - cgroup_css(cgroup_dummy_top, i)->cgroup = cgroup_dummy_top; + cgroup_css(cgroup_dummy_top, ss)->cgroup = cgroup_dummy_top; RCU_INIT_POINTER(cgrp->subsys[i], NULL); cgroup_subsys[i]->root = &cgroup_dummy_root; @@ -1930,7 +1933,7 @@ EXPORT_SYMBOL_GPL(cgroup_taskset_next); struct cgroup_subsys_state *cgroup_taskset_cur_css(struct cgroup_taskset *tset, int subsys_id) { - return cgroup_css(tset->cur_cgrp, subsys_id); + return cgroup_css(tset->cur_cgrp, cgroup_subsys[subsys_id]); } EXPORT_SYMBOL_GPL(cgroup_taskset_cur_css); @@ -2071,7 +2074,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 1: check that we can legitimately attach to the cgroup. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss->can_attach) { retval = ss->can_attach(css, &tset); @@ -2113,7 +2116,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, * step 4: do subsystem attach callbacks. */ for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss->attach) ss->attach(css, &tset); @@ -2135,7 +2138,7 @@ out_put_css_set_refs: out_cancel_attach: if (retval) { for_each_root_subsys(root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); if (ss == failed_ss) break; @@ -2481,13 +2484,9 @@ static int cgroup_file_open(struct inode *inode, struct file *file) * @css stays alive for all file operations. */ rcu_read_lock(); - if (cft->ss) { - css = cgroup_css(cgrp, cft->ss->subsys_id); - if (!css_tryget(css)) - css = NULL; - } else { - css = &cgrp->dummy_css; - } + css = cgroup_css(cgrp, cft->ss); + if (cft->ss && !css_tryget(css)) + css = NULL; rcu_read_unlock(); if (!css) @@ -2878,7 +2877,7 @@ static int cgroup_cfts_commit(struct cftype *cfts, bool is_add) /* add/rm files for all cgroups created before */ rcu_read_lock(); - css_for_each_descendant_pre(css, cgroup_css(root, ss->subsys_id)) { + css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; if (cgroup_is_dead(cgrp)) @@ -3082,10 +3081,7 @@ css_next_child(struct cgroup_subsys_state *pos_css, if (&next->sibling == &cgrp->children) return NULL; - if (parent_css->ss) - return cgroup_css(next, parent_css->ss->subsys_id); - else - return &next->dummy_css; + return cgroup_css(next, parent_css->ss); } EXPORT_SYMBOL_GPL(css_next_child); @@ -4110,7 +4106,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, rcu_read_lock(); ret = -EINVAL; - event->css = cgroup_css(cgrp, event->cft->ss->subsys_id); + event->css = cgroup_css(cgrp, event->cft->ss); if (event->css) ret = 0; @@ -4266,7 +4262,7 @@ static int cgroup_populate_dir(struct cgroup *cgrp, unsigned long subsys_mask) /* This cgroup is ready now */ for_each_root_subsys(cgrp->root, ss) { - struct cgroup_subsys_state *css = cgroup_css(cgrp, ss->subsys_id); + struct cgroup_subsys_state *css = cgroup_css(cgrp, ss); struct css_id *id = rcu_dereference_protected(css->id, true); /* @@ -4349,11 +4345,11 @@ static void init_css(struct cgroup_subsys_state *css, struct cgroup_subsys *ss, css->id = NULL; if (cgrp->parent) - css->parent = cgroup_css(cgrp->parent, ss->subsys_id); + css->parent = cgroup_css(cgrp->parent, ss); else css->flags |= CSS_ROOT; - BUG_ON(cgroup_css(cgrp, ss->subsys_id)); + BUG_ON(cgroup_css(cgrp, ss)); } /* invoke ->css_online() on a new CSS and mark it online if successful */ @@ -4466,7 +4462,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_root_subsys(root, ss) { struct cgroup_subsys_state *css; - css = ss->css_alloc(cgroup_css(parent, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(parent, ss)); if (IS_ERR(css)) { err = PTR_ERR(css); goto err_free_all; @@ -4712,7 +4708,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) * percpu refs of all css's are confirmed to be killed. */ for_each_root_subsys(cgrp->root, ss) - kill_css(cgroup_css(cgrp, ss->subsys_id)); + kill_css(cgroup_css(cgrp, ss)); /* * Mark @cgrp dead. This prevents further task migration and child @@ -4839,7 +4835,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) /* Create the top cgroup state for this subsystem */ list_add(&ss->sibling, &cgroup_dummy_root.subsys_list); ss->root = &cgroup_dummy_root; - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_css(css, ss, cgroup_dummy_top); @@ -4918,7 +4914,7 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) * struct, so this can happen first (i.e. before the dummy root * attachment). */ - css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + css = ss->css_alloc(cgroup_css(cgroup_dummy_top, ss)); if (IS_ERR(css)) { /* failure case - need to deassign the cgroup_subsys[] slot. */ cgroup_subsys[ss->subsys_id] = NULL; @@ -5000,7 +4996,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) mutex_lock(&cgroup_mutex); - offline_css(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + offline_css(cgroup_css(cgroup_dummy_top, ss)); if (ss->use_id) idr_destroy(&ss->idr); @@ -5034,7 +5030,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss) * the cgrp->subsys pointer to find their state. note that this * also takes care of freeing the css_id. */ - ss->css_free(cgroup_css(cgroup_dummy_top, ss->subsys_id)); + ss->css_free(cgroup_css(cgroup_dummy_top, ss)); RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL); mutex_unlock(&cgroup_mutex); @@ -5721,7 +5717,7 @@ struct cgroup_subsys_state *css_from_dir(struct dentry *dentry, return ERR_PTR(-EBADF); cgrp = __d_cgrp(dentry); - return cgroup_css(cgrp, ss->subsys_id) ?: ERR_PTR(-ENOENT); + return cgroup_css(cgrp, ss) ?: ERR_PTR(-ENOENT); } /** -- cgit v1.3-8-gc7d7 From 9fa4db334c7d9570aec7a5121e84fae99aae1d04 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: implement CFTYPE_NO_PREFIX When cgroup files are created, cgroup core automatically prepends the name of the subsystem as prefix. This patch adds CFTYPE_NO_ which disables the automatic prefix. This is to work around historical baggages and shouldn't be used for new files. This will be used to move "cgroup.event_control" from cgroup core to memcg. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov Cc: Glauber Costa --- include/linux/cgroup.h | 1 + kernel/cgroup.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 21ba29869eb8..3561d305b1e0 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -411,6 +411,7 @@ enum { CFTYPE_ONLY_ON_ROOT = (1 << 0), /* only create on root cgrp */ CFTYPE_NOT_ON_ROOT = (1 << 1), /* don't create on root cgrp */ CFTYPE_INSANE = (1 << 2), /* don't create if sane_behavior */ + CFTYPE_NO_PREFIX = (1 << 3), /* (DON'T USE FOR NEW FILES) no subsys prefix */ }; #define MAX_CFTYPE_NAME 64 diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 7516668d8325..a41dc87cd07e 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2756,7 +2756,8 @@ static int cgroup_add_file(struct cgroup *cgrp, struct cftype *cft) umode_t mode; char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; - if (cft->ss && !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { + if (cft->ss && !(cft->flags & CFTYPE_NO_PREFIX) && + !(cgrp->root->flags & CGRP_ROOT_NOPREFIX)) { strcpy(name, cft->ss->name); strcat(name, "."); } -- cgit v1.3-8-gc7d7 From 7941cb027dccedec3c047271554ddcf4be2e0697 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_event hold onto cgroup_subsys_state instead of cgroup Currently, each registered cgroup_event holds an extra reference to the cgroup. This is a bit weird as events are subsystem specific and will also be incorrect in the planned unified hierarchy as css (cgroup_subsys_state) may come and go dynamically across the lifetime of a cgroup. Holding onto cgroup won't prevent the target css from going away. Update cgroup_event to hold onto the css the traget file belongs to instead of cgroup. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- kernel/cgroup.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a41dc87cd07e..12237a291d88 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3969,7 +3969,6 @@ static void cgroup_event_remove(struct work_struct *work) struct cgroup_event *event = container_of(work, struct cgroup_event, remove); struct cgroup_subsys_state *css = event->css; - struct cgroup *cgrp = css->cgroup; remove_wait_queue(event->wqh, &event->wait); @@ -3980,7 +3979,7 @@ static void cgroup_event_remove(struct work_struct *work) eventfd_ctx_put(event->eventfd); kfree(event); - cgroup_dput(cgrp); + css_put(css); } /* @@ -4103,12 +4102,16 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, goto out_put_cfile; } - /* determine the css of @cfile and associate @event with it */ + /* + * Determine the css of @cfile and associate @event with it. + * Remaining events are automatically removed on cgroup destruction + * but the removal is asynchronous, so take an extra ref. + */ rcu_read_lock(); ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - if (event->css) + if (event->css && css_tryget(event->css)) ret = 0; rcu_read_unlock(); @@ -4122,28 +4125,21 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); if (cgrp_cfile != cgrp) { ret = -EINVAL; - goto out_put_cfile; + goto out_put_css; } if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; - goto out_put_cfile; + goto out_put_css; } ret = event->cft->register_event(event->css, event->cft, event->eventfd, buffer); if (ret) - goto out_put_cfile; + goto out_put_css; efile->f_op->poll(efile, &event->pt); - /* - * Events should be removed after rmdir of cgroup directory, but before - * destroying subsystem state objects. Let's take reference to cgroup - * directory dentry to do that. - */ - dget(cgrp->dentry); - spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); @@ -4153,6 +4149,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, return 0; +out_put_css: + css_put(event->css); out_put_cfile: fput(cfile); out_put_eventfd: -- cgit v1.3-8-gc7d7 From 7c918cbbd829669bf70ffcc45962d5d992942243 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 26 Aug 2013 18:40:56 -0400 Subject: cgroup: make cgroup_write_event_control() use css_from_dir() instead of __d_cgrp() cgroup_event will be moved to its only user - memcg. Replace __d_cgrp() usage with css_from_dir(), which is already exported. This also simplifies the code a bit. Signed-off-by: Tejun Heo Acked-by: Li Zefan Acked-by: Kirill A. Shutemov --- kernel/cgroup.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 12237a291d88..e76698dd6c08 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4041,7 +4041,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, { struct cgroup *cgrp = dummy_css->cgroup; struct cgroup_event *event; - struct cgroup *cgrp_cfile; + struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; struct file *efile; struct file *cfile; @@ -4103,7 +4103,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, } /* - * Determine the css of @cfile and associate @event with it. + * Determine the css of @cfile, verify it belongs to the same + * cgroup as cgroup.event_control, and associate @event with it. * Remaining events are automatically removed on cgroup destruction * but the removal is asynchronous, so take an extra ref. */ @@ -4111,23 +4112,14 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - if (event->css && css_tryget(event->css)) + cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); + if (event->css && event->css == cfile_css && css_tryget(event->css)) ret = 0; rcu_read_unlock(); if (ret) goto out_put_cfile; - /* - * The file to be monitored must be in the same cgroup as - * cgroup.event_control is. - */ - cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); - if (cgrp_cfile != cgrp) { - ret = -EINVAL; - goto out_put_css; - } - if (!event->cft->register_event || !event->cft->unregister_event) { ret = -EINVAL; goto out_put_css; -- cgit v1.3-8-gc7d7 From 21e851943e31022731cd5fad386ca8fb552dbe64 Mon Sep 17 00:00:00 2001 From: "Raphael S.Carvalho" Date: Wed, 27 Feb 2013 15:32:09 -0300 Subject: kernel/nsproxy.c: Improving a snippet of code. It seems GCC generates a better code in that way, so I changed that statement. Btw, they have the same semantic, so I'm sending this patch due to performance issues. Acked-by: Serge E. Hallyn Signed-off-by: Raphael S.Carvalho Signed-off-by: Eric W. Biederman --- kernel/nsproxy.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 364ceab15f0c..d9afd256318f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -148,7 +148,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) * means share undolist with parent, so we must forbid using * it along with CLONE_NEWIPC. */ - if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) { + if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) == + (CLONE_NEWIPC | CLONE_SYSVSEM)) { err = -EINVAL; goto out; } -- cgit v1.3-8-gc7d7 From e51db73532955dc5eaba4235e62b74b460709d5b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 30 Mar 2013 19:57:41 -0700 Subject: userns: Better restrictions on when proc and sysfs can be mounted Rely on the fact that another flavor of the filesystem is already mounted and do not rely on state in the user namespace. Verify that the mounted filesystem is not covered in any significant way. I would love to verify that the previously mounted filesystem has no mounts on top but there are at least the directories /proc/sys/fs/binfmt_misc and /sys/fs/cgroup/ that exist explicitly for other filesystems to mount on top of. Refactor the test into a function named fs_fully_visible and call that function from the mount routines of proc and sysfs. This makes this test local to the filesystems involved and the results current of when the mounts take place, removing a weird threading of the user namespace, the mount namespace and the filesystems themselves. Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 37 +++++++++++++++++++++++++------------ fs/proc/root.c | 7 +++++-- fs/sysfs/mount.c | 3 ++- include/linux/fs.h | 1 + include/linux/user_namespace.h | 4 ---- kernel/user.c | 2 -- kernel/user_namespace.c | 2 -- 7 files changed, 33 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 64627f883bf2..877e4277f496 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2867,25 +2867,38 @@ bool current_chrooted(void) return chrooted; } -void update_mnt_policy(struct user_namespace *userns) +bool fs_fully_visible(struct file_system_type *type) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt; + bool visible = false; - down_read(&namespace_sem); + if (unlikely(!ns)) + return false; + + namespace_lock(); list_for_each_entry(mnt, &ns->list, mnt_list) { - switch (mnt->mnt.mnt_sb->s_magic) { - case SYSFS_MAGIC: - userns->may_mount_sysfs = true; - break; - case PROC_SUPER_MAGIC: - userns->may_mount_proc = true; - break; + struct mount *child; + if (mnt->mnt.mnt_sb->s_type != type) + continue; + + /* This mount is not fully visible if there are any child mounts + * that cover anything except for empty directories. + */ + list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { + struct inode *inode = child->mnt_mountpoint->d_inode; + if (!S_ISDIR(inode->i_mode)) + goto next; + if (inode->i_nlink != 2) + goto next; } - if (userns->may_mount_sysfs && userns->may_mount_proc) - break; + visible = true; + goto found; + next: ; } - up_read(&namespace_sem); +found: + namespace_unlock(); + return visible; } static void *mntns_get(struct task_struct *task) diff --git a/fs/proc/root.c b/fs/proc/root.c index 38bd5d423fcd..45e5fb7da09b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -110,8 +110,11 @@ static struct dentry *proc_mount(struct file_system_type *fs_type, ns = task_active_pid_ns(current); options = data; - if (!current_user_ns()->may_mount_proc || - !ns_capable(ns->user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN) && !fs_fully_visible(fs_type)) + return ERR_PTR(-EPERM); + + /* Does the mounter have privilege over the pid namespace? */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index afd83273e6ce..4a2da3a4b1b1 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -112,7 +112,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type, struct super_block *sb; int error; - if (!(flags & MS_KERNMOUNT) && !current_user_ns()->may_mount_sysfs) + if (!(flags & MS_KERNMOUNT) && !capable(CAP_SYS_ADMIN) && + !fs_fully_visible(fs_type)) return ERR_PTR(-EPERM); info = kzalloc(sizeof(*info), GFP_KERNEL); diff --git a/include/linux/fs.h b/include/linux/fs.h index 981874773e85..3050c620f062 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1897,6 +1897,7 @@ extern int vfs_ustat(dev_t, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); extern bool our_mnt(struct vfsmount *mnt); +extern bool fs_fully_visible(struct file_system_type *); extern int current_umask(void); diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index b6b215f13b45..4ce009324933 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -26,8 +26,6 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; - bool may_mount_sysfs; - bool may_mount_proc; }; extern struct user_namespace init_user_ns; @@ -84,6 +82,4 @@ static inline void put_user_ns(struct user_namespace *ns) #endif -void update_mnt_policy(struct user_namespace *userns); - #endif /* _LINUX_USER_H */ diff --git a/kernel/user.c b/kernel/user.c index 69b4c3d48cde..5bbb91988e69 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,8 +51,6 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, - .may_mount_sysfs = true, - .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index d8c30db06c5b..d58ad1e7a794 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -97,8 +97,6 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); - update_mnt_policy(ns); - return 0; } -- cgit v1.3-8-gc7d7 From d1625964da51bda61306ad3ec45307a799c21f08 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 27 Aug 2013 14:27:23 -0400 Subject: cgroup: fix cgroup_css() invocation in css_from_id() ca8bdcaff0 ("cgroup: make cgroup_css() take cgroup_subsys * instead and allow NULL subsys") missed one conversion in css_from_id(), which was newly added. As css_from_id() doesn't have any user yet, this doesn't break anything other than generating a build warning. Convert it. Signed-off-by: Tejun Heo Reported-by: Stephen Rothwell Reported-by: kbuild test robot --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e76698dd6c08..b5f4989937f2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5729,7 +5729,7 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) cgrp = idr_find(&ss->root->cgroup_idr, id); if (cgrp) - return cgroup_css(cgrp, ss->subsys_id); + return cgroup_css(cgrp, ss); return NULL; } -- cgit v1.3-8-gc7d7 From 9c823f9f7e4b392921d0d8b251bec080d58f9077 Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Thu, 22 Aug 2013 06:43:37 +0000 Subject: padata - share code between CPU_ONLINE and CPU_DOWN_FAILED, same to CPU_DOWN_PREPARE and CPU_UP_CANCELED Share code between CPU_ONLINE and CPU_DOWN_FAILED, same to CPU_DOWN_PREPARE and CPU_UP_CANCELED. It will fix 2 bugs: "not check the return value of __padata_remove_cpu() and __padata_add_cpu()". "need add 'break' between CPU_UP_CANCELED and CPU_DOWN_FAILED". Signed-off-by: Chen Gang Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 072f4ee4eb89..2f0037a86289 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -846,6 +846,8 @@ static int padata_cpu_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); @@ -857,6 +859,8 @@ static int padata_cpu_callback(struct notifier_block *nfb, case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!pinst_has_cpu(pinst, cpu)) break; mutex_lock(&pinst->lock); @@ -865,22 +869,6 @@ static int padata_cpu_callback(struct notifier_block *nfb, if (err) return notifier_from_errno(err); break; - - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - __padata_remove_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); - - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - if (!pinst_has_cpu(pinst, cpu)) - break; - mutex_lock(&pinst->lock); - __padata_add_cpu(pinst, cpu); - mutex_unlock(&pinst->lock); } return NOTIFY_OK; -- cgit v1.3-8-gc7d7 From b8b4a4166e3401b7d8ea9deb8d64d875a468144c Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 23 Aug 2013 13:12:33 +0200 Subject: padata - Register hotcpu notifier after initialization padata_cpu_callback() takes pinst->lock, to avoid taking an uninitialized lock, register the notifier after it's initialization. Signed-off-by: Richard Weinberger Acked-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 2f0037a86289..07af2c95dcfe 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -1074,18 +1074,18 @@ struct padata_instance *padata_alloc(struct workqueue_struct *wq, pinst->flags = 0; -#ifdef CONFIG_HOTPLUG_CPU - pinst->cpu_notifier.notifier_call = padata_cpu_callback; - pinst->cpu_notifier.priority = 0; - register_hotcpu_notifier(&pinst->cpu_notifier); -#endif - put_online_cpus(); BLOCKING_INIT_NOTIFIER_HEAD(&pinst->cpumask_change_notifier); kobject_init(&pinst->kobj, &padata_attr_type); mutex_init(&pinst->lock); +#ifdef CONFIG_HOTPLUG_CPU + pinst->cpu_notifier.notifier_call = padata_cpu_callback; + pinst->cpu_notifier.priority = 0; + register_hotcpu_notifier(&pinst->cpu_notifier); +#endif + return pinst; err_free_masks: -- cgit v1.3-8-gc7d7 From ff3d527cebc1fa3707c617bfe9e74f53fcfb0955 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 27 Aug 2013 11:23:07 +0300 Subject: perf: make events stream always parsable The event stream is not always parsable because the format of a sample is dependent on the sample_type of the selected event. When there is more than one selected event and the sample_types are not the same then parsing becomes problematic. A sample can be matched to its selected event using the ID that is allocated when the event is opened. Unfortunately, to get the ID from the sample means first parsing it. This patch adds a new sample format bit PERF_SAMPLE_IDENTIFER that puts the ID at a fixed position so that the ID can be retrieved without parsing the sample. For sample events, that is the first position immediately after the header. For non-sample events, that is the last position. In this respect parsing samples requires that the sample_type and ID values are recorded. For example, perf tools records struct perf_event_attr and the IDs within the perf.data file. Those must be read first before it is possible to parse samples found later in the perf.data file. Signed-off-by: Adrian Hunter Tested-by: Stephane Eranian Acked-by: Peter Zijlstra Cc: David Ahern Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mike Galbraith Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1377591794-30553-6-git-send-email-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- include/uapi/linux/perf_event.h | 27 ++++++++++++++++++++------- kernel/events/core.c | 11 ++++++++++- 2 files changed, 30 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 62c25a25291c..42cb7b62ca59 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -134,8 +134,9 @@ enum perf_event_sample_format { PERF_SAMPLE_STACK_USER = 1U << 13, PERF_SAMPLE_WEIGHT = 1U << 14, PERF_SAMPLE_DATA_SRC = 1U << 15, + PERF_SAMPLE_IDENTIFIER = 1U << 16, - PERF_SAMPLE_MAX = 1U << 16, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 17, /* non-ABI */ }; /* @@ -492,12 +493,12 @@ enum perf_event_type { /* * If perf_event_attr.sample_id_all is set then all event types will * have the sample_type selected fields related to where/when - * (identity) an event took place (TID, TIME, ID, CPU, STREAM_ID) - * described in PERF_RECORD_SAMPLE below, it will be stashed just after - * the perf_event_header and the fields already present for the existing - * fields, i.e. at the end of the payload. That way a newer perf.data - * file will be supported by older perf tools, with these new optional - * fields being ignored. + * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, + * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed + * just after the perf_event_header and the fields already present for + * the existing fields, i.e. at the end of the payload. That way a newer + * perf.data file will be supported by older perf tools, with these new + * optional fields being ignored. * * struct sample_id { * { u32 pid, tid; } && PERF_SAMPLE_TID @@ -505,7 +506,12 @@ enum perf_event_type { * { u64 id; } && PERF_SAMPLE_ID * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 id; } && PERF_SAMPLE_IDENTIFIER * } && perf_event_attr::sample_id_all + * + * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. The + * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed + * relative to header.size. */ /* @@ -594,6 +600,13 @@ enum perf_event_type { * struct { * struct perf_event_header header; * + * # + * # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. + * # The advantage of PERF_SAMPLE_IDENTIFIER is that its position + * # is fixed relative to header. + * # + * + * { u64 id; } && PERF_SAMPLE_IDENTIFIER * { u64 ip; } && PERF_SAMPLE_IP * { u32 pid, tid; } && PERF_SAMPLE_TID * { u64 time; } && PERF_SAMPLE_TIME diff --git a/kernel/events/core.c b/kernel/events/core.c index 928fae7ca8c7..15d0f2418e54 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1213,6 +1213,9 @@ static void perf_event__id_header_size(struct perf_event *event) if (sample_type & PERF_SAMPLE_TIME) size += sizeof(data->time); + if (sample_type & PERF_SAMPLE_IDENTIFIER) + size += sizeof(data->id); + if (sample_type & PERF_SAMPLE_ID) size += sizeof(data->id); @@ -4280,7 +4283,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_TIME) data->time = perf_clock(); - if (sample_type & PERF_SAMPLE_ID) + if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); if (sample_type & PERF_SAMPLE_STREAM_ID) @@ -4319,6 +4322,9 @@ static void __perf_event__output_id_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CPU) perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_IDENTIFIER) + perf_output_put(handle, data->id); } void perf_event__output_id_sample(struct perf_event *event, @@ -4432,6 +4438,9 @@ void perf_output_sample(struct perf_output_handle *handle, perf_output_put(handle, *header); + if (sample_type & PERF_SAMPLE_IDENTIFIER) + perf_output_put(handle, data->id); + if (sample_type & PERF_SAMPLE_IP) perf_output_put(handle, data->ip); -- cgit v1.3-8-gc7d7 From a606488513543312805fab2b93070cefe6a3016c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 29 Aug 2013 13:56:50 -0700 Subject: pidns: Fix hang in zap_pid_ns_processes by sending a potentially extra wakeup Serge Hallyn writes: > Since commit af4b8a83add95ef40716401395b44a1b579965f4 it's been > possible to get into a situation where a pidns reaper is > , reparented to host pid 1, but never reaped. How to > reproduce this is documented at > > https://bugs.launchpad.net/ubuntu/+source/lxc/+bug/1168526 > (and see > https://bugs.launchpad.net/ubuntu/+source/lxc/+bug/1168526/comments/13) > In short, run repeated starts of a container whose init is > > Process.exit(0); > > sysrq-t when such a task is playing zombie shows: > > [ 131.132978] init x ffff88011fc14580 0 2084 2039 0x00000000 > [ 131.132978] ffff880116e89ea8 0000000000000002 ffff880116e89fd8 0000000000014580 > [ 131.132978] ffff880116e89fd8 0000000000014580 ffff8801172a0000 ffff8801172a0000 > [ 131.132978] ffff8801172a0630 ffff88011729fff0 ffff880116e14650 ffff88011729fff0 > [ 131.132978] Call Trace: > [ 131.132978] [] schedule+0x29/0x70 > [ 131.132978] [] do_exit+0x6e1/0xa40 > [ 131.132978] [] ? signal_wake_up_state+0x1e/0x30 > [ 131.132978] [] do_group_exit+0x3f/0xa0 > [ 131.132978] [] SyS_exit_group+0x14/0x20 > [ 131.132978] [] tracesys+0xe1/0xe6 > > Further debugging showed that every time this happened, zap_pid_ns_processes() > started with nr_hashed being 3, while we were expecting it to drop to 2. > Any time it didn't happen, nr_hashed was 1 or 2. So the reaper was > waiting for nr_hashed to become 2, but free_pid() only wakes the reaper > if nr_hashed hits 1. The issue is that when the task group leader of an init process exits before other tasks of the init process when the init process finally exits it will be a secondary task sleeping in zap_pid_ns_processes and waiting to wake up when the number of hashed pids drops to two. This case waits forever as free_pid only sends a wake up when the number of hashed pids drops to 1. To correct this the simple strategy of sending a possibly unncessary wake up when the number of hashed pids drops to 2 is adopted. Sending one extraneous wake up is relatively harmless, at worst we waste a little cpu time in the rare case when a pid namespace appropaches exiting. We can detect the case when the pid namespace drops to just two pids hashed race free in free_pid. Dereferencing pid_ns->child_reaper with the pidmap_lock held is safe without out the tasklist_lock because it is guaranteed that the detach_pid will be called on the child_reaper before it is freed and detach_pid calls __change_pid which calls free_pid which takes the pidmap_lock. __change_pid only calls free_pid if this is the last use of the pid. For a thread that is not the thread group leader the threads pid will only ever have one user because a threads pid is not allowed to be the pid of a process, of a process group or a session. For a thread that is a thread group leader all of the other threads of that process will be reaped before it is allowed for the thread group leader to be reaped ensuring there will only be one user of the threads pid as a process pid. Furthermore because the thread is the init process of a pid namespace all of the other processes in the pid namespace will have also been already freed leading to the fact that the pid will not be used as a session pid or a process group pid for any other running process. CC: stable@vger.kernel.org Acked-by: Serge Hallyn Tested-by: Serge Hallyn Reported-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/pid.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index 66505c1dfc51..ebe5e80b10f8 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -265,6 +265,7 @@ void free_pid(struct pid *pid) struct pid_namespace *ns = upid->ns; hlist_del_rcu(&upid->pid_chain); switch(--ns->nr_hashed) { + case 2: case 1: /* When all that is left in the pid namespace * is the reaper wake up the reaper. The reaper -- cgit v1.3-8-gc7d7 From dbef0c1c4c5f8ce5d1f5bd8cee092a7afb4ac21b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 9 Mar 2013 16:15:23 -0800 Subject: namespaces: Simplify copy_namespaces so it is clear what is going on. Remove the test for the impossible case where tsk->nsproxy == NULL. Fork will never be called with tsk->nsproxy == NULL. Only call get_nsproxy when we don't need to generate a new_nsproxy, and mark the case where we don't generate a new nsproxy as likely. Remove the code to drop an unnecessarily acquired nsproxy value. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/nsproxy.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index d9afd256318f..a1ed01139276 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -125,22 +125,16 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *old_ns = tsk->nsproxy; struct user_namespace *user_ns = task_cred_xxx(tsk, user_ns); struct nsproxy *new_ns; - int err = 0; - if (!old_ns) + if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | + CLONE_NEWPID | CLONE_NEWNET)))) { + get_nsproxy(old_ns); return 0; - - get_nsproxy(old_ns); - - if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET))) - return 0; - - if (!ns_capable(user_ns, CAP_SYS_ADMIN)) { - err = -EPERM; - goto out; } + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return -EPERM; + /* * CLONE_NEWIPC must detach from the undolist: after switching * to a new ipc namespace, the semaphore arrays from the old @@ -149,22 +143,15 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) * it along with CLONE_NEWIPC. */ if ((flags & (CLONE_NEWIPC | CLONE_SYSVSEM)) == - (CLONE_NEWIPC | CLONE_SYSVSEM)) { - err = -EINVAL; - goto out; - } + (CLONE_NEWIPC | CLONE_SYSVSEM)) + return -EINVAL; new_ns = create_new_namespaces(flags, tsk, user_ns, tsk->fs); - if (IS_ERR(new_ns)) { - err = PTR_ERR(new_ns); - goto out; - } + if (IS_ERR(new_ns)) + return PTR_ERR(new_ns); tsk->nsproxy = new_ns; - -out: - put_nsproxy(old_ns); - return err; + return 0; } void free_nsproxy(struct nsproxy *ns) -- cgit v1.3-8-gc7d7 From 8fd37a4c9822d58c93f764864582aa13112b1513 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Aug 2013 14:19:38 +0200 Subject: PM / hibernate: Create memory bitmaps after freezing user space The hibernation core uses special memory bitmaps during image creation and restoration and traditionally those bitmaps are allocated before freezing tasks, because in the past GFP_KERNEL allocations might not work after all tasks had been frozen. However, this is an anachronism, because hibernation_snapshot() now calls hibernate_preallocate_memory() which allocates memory for the image upfront anyway, so the memory bitmaps may be allocated after freezing user space safely. For this reason, move all of the create_basic_memory_bitmaps() calls after freeze_processes() and all of the corresponding free_basic_memory_bitmaps() calls before thaw_processes(). This will allow us to hold device_hotplug_lock around hibernation without the need to worry about freezing issues with user space processes attempting to acquire it via sysfs attributes after the creation of memory bitmaps and before the freezing of tasks. Signed-off-by: Rafael J. Wysocki Acked-by: Toshi Kani --- kernel/power/hibernate.c | 41 +++++++++++++++++++---------------------- kernel/power/user.c | 22 ++++++++++++---------- 2 files changed, 31 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b26f5f1e773e..d4e54053d009 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -644,22 +644,22 @@ int hibernate(void) if (error) goto Exit; - /* Allocate memory management structures */ - error = create_basic_memory_bitmaps(); - if (error) - goto Exit; - printk(KERN_INFO "PM: Syncing filesystems ... "); sys_sync(); printk("done.\n"); error = freeze_processes(); if (error) - goto Free_bitmaps; + goto Exit; + + /* Allocate memory management structures */ + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); if (error || freezer_test_done) - goto Thaw; + goto Free_bitmaps; if (in_suspend) { unsigned int flags = 0; @@ -682,14 +682,13 @@ int hibernate(void) pr_debug("PM: Image restored successfully.\n"); } + Free_bitmaps: + free_basic_memory_bitmaps(); Thaw: thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ freezer_test_done = false; - - Free_bitmaps: - free_basic_memory_bitmaps(); Exit: pm_notifier_call_chain(PM_POST_HIBERNATION); pm_restore_console(); @@ -806,21 +805,19 @@ static int software_resume(void) pm_prepare_console(); error = pm_notifier_call_chain(PM_RESTORE_PREPARE); if (error) - goto close_finish; - - error = create_basic_memory_bitmaps(); - if (error) - goto close_finish; + goto Close_Finish; pr_debug("PM: Preparing processes for restore.\n"); error = freeze_processes(); - if (error) { - swsusp_close(FMODE_READ); - goto Done; - } + if (error) + goto Close_Finish; pr_debug("PM: Loading hibernation image.\n"); + error = create_basic_memory_bitmaps(); + if (error) + goto Thaw; + error = swsusp_read(&flags); swsusp_close(FMODE_READ); if (!error) @@ -828,9 +825,9 @@ static int software_resume(void) printk(KERN_ERR "PM: Failed to load hibernation image, recovering.\n"); swsusp_free(); - thaw_processes(); - Done: free_basic_memory_bitmaps(); + Thaw: + thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); pm_restore_console(); @@ -840,7 +837,7 @@ static int software_resume(void) mutex_unlock(&pm_mutex); pr_debug("PM: Hibernation image not present or could not be loaded.\n"); return error; -close_finish: + Close_Finish: swsusp_close(FMODE_READ); goto Finish; } diff --git a/kernel/power/user.c b/kernel/power/user.c index 4ed81e74f86f..63368163e98d 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -60,11 +60,6 @@ static int snapshot_open(struct inode *inode, struct file *filp) error = -ENOSYS; goto Unlock; } - if(create_basic_memory_bitmaps()) { - atomic_inc(&snapshot_device_available); - error = -ENOMEM; - goto Unlock; - } nonseekable_open(inode, filp); data = &snapshot_state; filp->private_data = data; @@ -90,10 +85,9 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) pm_notifier_call_chain(PM_POST_RESTORE); } - if (error) { - free_basic_memory_bitmaps(); + if (error) atomic_inc(&snapshot_device_available); - } + data->frozen = 0; data->ready = 0; data->platform_support = 0; @@ -111,11 +105,11 @@ static int snapshot_release(struct inode *inode, struct file *filp) lock_system_sleep(); swsusp_free(); - free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); if (data->frozen) { pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); thaw_processes(); } pm_notifier_call_chain(data->mode == O_RDONLY ? @@ -220,14 +214,22 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, printk("done.\n"); error = freeze_processes(); - if (!error) + if (error) + break; + + error = create_basic_memory_bitmaps(); + if (error) + thaw_processes(); + else data->frozen = 1; + break; case SNAPSHOT_UNFREEZE: if (!data->frozen || data->ready) break; pm_restore_gfp_mask(); + free_basic_memory_bitmaps(); thaw_processes(); data->frozen = 0; break; -- cgit v1.3-8-gc7d7 From 942f40155a743f4204308d62405dacaa4bfadb11 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 30 Aug 2013 14:19:46 +0200 Subject: PM / hibernate / memory hotplug: Rework mutual exclusion Since all of the memory hotplug operations have to be carried out under device_hotplug_lock, they won't need to acquire pm_mutex if device_hotplug_lock is held around hibernation. For this reason, make the hibernation code acquire device_hotplug_lock after freezing user space processes and release it before thawing them. At the same tim drop the lock_system_sleep() and unlock_system_sleep() calls from lock_memory_hotplug() and unlock_memory_hotplug(), respectively. Signed-off-by: Rafael J. Wysocki Acked-by: Toshi Kani --- kernel/power/hibernate.c | 4 ++++ kernel/power/user.c | 2 ++ mm/memory_hotplug.c | 4 ---- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index d4e54053d009..0b78f72ad39d 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -652,6 +652,7 @@ int hibernate(void) if (error) goto Exit; + lock_device_hotplug(); /* Allocate memory management structures */ error = create_basic_memory_bitmaps(); if (error) @@ -685,6 +686,7 @@ int hibernate(void) Free_bitmaps: free_basic_memory_bitmaps(); Thaw: + unlock_device_hotplug(); thaw_processes(); /* Don't bother checking whether freezer_test_done is true */ @@ -814,6 +816,7 @@ static int software_resume(void) pr_debug("PM: Loading hibernation image.\n"); + lock_device_hotplug(); error = create_basic_memory_bitmaps(); if (error) goto Thaw; @@ -827,6 +830,7 @@ static int software_resume(void) swsusp_free(); free_basic_memory_bitmaps(); Thaw: + unlock_device_hotplug(); thaw_processes(); Finish: pm_notifier_call_chain(PM_POST_RESTORE); diff --git a/kernel/power/user.c b/kernel/power/user.c index 63368163e98d..72e8f4fd616d 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -201,6 +201,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, if (!mutex_trylock(&pm_mutex)) return -EBUSY; + lock_device_hotplug(); data = filp->private_data; switch (cmd) { @@ -373,6 +374,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, } + unlock_device_hotplug(); mutex_unlock(&pm_mutex); return error; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ca1dd3aa5eee..53ad1325d7a7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -51,14 +51,10 @@ DEFINE_MUTEX(mem_hotplug_mutex); void lock_memory_hotplug(void) { mutex_lock(&mem_hotplug_mutex); - - /* for exclusive hibernation if CONFIG_HIBERNATION=y */ - lock_system_sleep(); } void unlock_memory_hotplug(void) { - unlock_system_sleep(); mutex_unlock(&mem_hotplug_mutex); } -- cgit v1.3-8-gc7d7 From 6e556ce209b09528dbf1931cbfd5d323e1345926 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 5 Mar 2013 13:59:48 -0800 Subject: pidns: Don't have unshare(CLONE_NEWPID) imply CLONE_THREAD I goofed when I made unshare(CLONE_NEWPID) only work in a single-threaded process. There is no need for that requirement and in fact I analyzied things right for setns. The hard requirement is for tasks that share a VM to all be in the pid namespace and we properly prevent that in do_fork. Just to be certain I took a look through do_wait and forget_original_parent and there are no cases that make it any harder for children to be in the multiple pid namespaces than it is for children to be in the same pid namespace. I also performed a check to see if there were in uses of task->nsproxy_pid_ns I was not familiar with, but it is only used when allocating a new pid for a new task, and in checks to prevent craziness from happening. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- kernel/fork.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 66635c80a813..eb45f1d72703 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1817,11 +1817,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) */ if (unshare_flags & CLONE_NEWUSER) unshare_flags |= CLONE_THREAD | CLONE_FS; - /* - * If unsharing a pid namespace must also unshare the thread. - */ - if (unshare_flags & CLONE_NEWPID) - unshare_flags |= CLONE_THREAD; /* * If unsharing a thread from a thread group, must also unshare vm. */ -- cgit v1.3-8-gc7d7 From c7b96acf1456ef127fef461fcfedb54b81fecfbb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 20 Mar 2013 12:49:49 -0700 Subject: userns: Kill nsown_capable it makes the wrong thing easy nsown_capable is a special case of ns_capable essentially for just CAP_SETUID and CAP_SETGID. For the existing users it doesn't noticably simplify things and from the suggested patches I have seen it encourages people to do the wrong thing. So remove nsown_capable. Acked-by: Serge Hallyn Signed-off-by: "Eric W. Biederman" --- fs/namespace.c | 4 ++-- fs/open.c | 2 +- include/linux/capability.h | 1 - ipc/namespace.c | 2 +- kernel/capability.c | 12 ------------ kernel/groups.c | 2 +- kernel/pid_namespace.c | 2 +- kernel/sys.c | 20 ++++++++++---------- kernel/uid16.c | 2 +- kernel/utsname.c | 2 +- net/core/net_namespace.c | 2 +- net/core/scm.c | 4 ++-- 12 files changed, 21 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/fs/namespace.c b/fs/namespace.c index 877e4277f496..dc519a1437ee 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2929,8 +2929,8 @@ static int mntns_install(struct nsproxy *nsproxy, void *ns) struct path root; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_CHROOT) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; if (fs->users != 1) diff --git a/fs/open.c b/fs/open.c index 9156cb050d08..1c9d23f7e683 100644 --- a/fs/open.c +++ b/fs/open.c @@ -443,7 +443,7 @@ retry: goto dput_and_out; error = -EPERM; - if (!nsown_capable(CAP_SYS_CHROOT)) + if (!ns_capable(current_user_ns(), CAP_SYS_CHROOT)) goto dput_and_out; error = security_path_chroot(&path); if (error) diff --git a/include/linux/capability.h b/include/linux/capability.h index d9a4f7f40f32..a6ee1f9a5018 100644 --- a/include/linux/capability.h +++ b/include/linux/capability.h @@ -210,7 +210,6 @@ extern bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap); extern bool capable(int cap); extern bool ns_capable(struct user_namespace *ns, int cap); -extern bool nsown_capable(int cap); extern bool inode_capable(const struct inode *inode, int cap); extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap); diff --git a/ipc/namespace.c b/ipc/namespace.c index 7ee61bf44933..4be6581d3b7f 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -171,7 +171,7 @@ static int ipcns_install(struct nsproxy *nsproxy, void *new) { struct ipc_namespace *ns = new; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; /* Ditch state from the old ipc namespace */ diff --git a/kernel/capability.c b/kernel/capability.c index f6c2ce5701e1..6fc1c8af44df 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -432,18 +432,6 @@ bool capable(int cap) } EXPORT_SYMBOL(capable); -/** - * nsown_capable - Check superior capability to one's own user_ns - * @cap: The capability in question - * - * Return true if the current task has the given superior capability - * targeted at its own user namespace. - */ -bool nsown_capable(int cap) -{ - return ns_capable(current_user_ns(), cap); -} - /** * inode_capable - Check superior capability over inode * @inode: The inode in question diff --git a/kernel/groups.c b/kernel/groups.c index 6b2588dd04ff..90cf1c38c8ea 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!nsown_capable(CAP_SETGID)) + if (!ns_capable(current_user_ns(), CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6917e8edb48e..ee1f6bb83d67 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -329,7 +329,7 @@ static int pidns_install(struct nsproxy *nsproxy, void *ns) struct pid_namespace *ancestor, *new = ns; if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; /* diff --git a/kernel/sys.c b/kernel/sys.c index 771129b299f8..c18ecca575b4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -337,7 +337,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (rgid != (gid_t) -1) { if (gid_eq(old->gid, krgid) || gid_eq(old->egid, krgid) || - nsown_capable(CAP_SETGID)) + ns_capable(old->user_ns, CAP_SETGID)) new->gid = krgid; else goto error; @@ -346,7 +346,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid) if (gid_eq(old->gid, kegid) || gid_eq(old->egid, kegid) || gid_eq(old->sgid, kegid) || - nsown_capable(CAP_SETGID)) + ns_capable(old->user_ns, CAP_SETGID)) new->egid = kegid; else goto error; @@ -387,7 +387,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid) old = current_cred(); retval = -EPERM; - if (nsown_capable(CAP_SETGID)) + if (ns_capable(old->user_ns, CAP_SETGID)) new->gid = new->egid = new->sgid = new->fsgid = kgid; else if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->sgid)) new->egid = new->fsgid = kgid; @@ -471,7 +471,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) new->uid = kruid; if (!uid_eq(old->uid, kruid) && !uid_eq(old->euid, kruid) && - !nsown_capable(CAP_SETUID)) + !ns_capable(old->user_ns, CAP_SETUID)) goto error; } @@ -480,7 +480,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid) if (!uid_eq(old->uid, keuid) && !uid_eq(old->euid, keuid) && !uid_eq(old->suid, keuid) && - !nsown_capable(CAP_SETUID)) + !ns_capable(old->user_ns, CAP_SETUID)) goto error; } @@ -534,7 +534,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid) old = current_cred(); retval = -EPERM; - if (nsown_capable(CAP_SETUID)) { + if (ns_capable(old->user_ns, CAP_SETUID)) { new->suid = new->uid = kuid; if (!uid_eq(kuid, old->uid)) { retval = set_user(new); @@ -591,7 +591,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid) old = current_cred(); retval = -EPERM; - if (!nsown_capable(CAP_SETUID)) { + if (!ns_capable(old->user_ns, CAP_SETUID)) { if (ruid != (uid_t) -1 && !uid_eq(kruid, old->uid) && !uid_eq(kruid, old->euid) && !uid_eq(kruid, old->suid)) goto error; @@ -673,7 +673,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid) old = current_cred(); retval = -EPERM; - if (!nsown_capable(CAP_SETGID)) { + if (!ns_capable(old->user_ns, CAP_SETGID)) { if (rgid != (gid_t) -1 && !gid_eq(krgid, old->gid) && !gid_eq(krgid, old->egid) && !gid_eq(krgid, old->sgid)) goto error; @@ -744,7 +744,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid) if (uid_eq(kuid, old->uid) || uid_eq(kuid, old->euid) || uid_eq(kuid, old->suid) || uid_eq(kuid, old->fsuid) || - nsown_capable(CAP_SETUID)) { + ns_capable(old->user_ns, CAP_SETUID)) { if (!uid_eq(kuid, old->fsuid)) { new->fsuid = kuid; if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0) @@ -783,7 +783,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid) if (gid_eq(kgid, old->gid) || gid_eq(kgid, old->egid) || gid_eq(kgid, old->sgid) || gid_eq(kgid, old->fsgid) || - nsown_capable(CAP_SETGID)) { + ns_capable(old->user_ns, CAP_SETGID)) { if (!gid_eq(kgid, old->fsgid)) { new->fsgid = kgid; goto change_okay; diff --git a/kernel/uid16.c b/kernel/uid16.c index f6c83d7ef000..602e5bbbceff 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!nsown_capable(CAP_SETGID)) + if (!ns_capable(current_user_ns(), CAP_SETGID)) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/utsname.c b/kernel/utsname.c index 2fc8576efaa8..fd393124e507 100644 --- a/kernel/utsname.c +++ b/kernel/utsname.c @@ -114,7 +114,7 @@ static int utsns_install(struct nsproxy *nsproxy, void *new) struct uts_namespace *ns = new; if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; get_uts_ns(ns); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index f97652036754..81d3a9a08453 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -651,7 +651,7 @@ static int netns_install(struct nsproxy *nsproxy, void *ns) struct net *net = ns; if (!ns_capable(net->user_ns, CAP_SYS_ADMIN) || - !nsown_capable(CAP_SYS_ADMIN)) + !ns_capable(current_user_ns(), CAP_SYS_ADMIN)) return -EPERM; put_net(nsproxy->net_ns); diff --git a/net/core/scm.c b/net/core/scm.c index 03795d0147f2..c346f58d97c2 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -56,9 +56,9 @@ static __inline__ int scm_check_creds(struct ucred *creds) if ((creds->pid == task_tgid_vnr(current) || ns_capable(current->nsproxy->pid_ns->user_ns, CAP_SYS_ADMIN)) && ((uid_eq(uid, cred->uid) || uid_eq(uid, cred->euid) || - uid_eq(uid, cred->suid)) || nsown_capable(CAP_SETUID)) && + uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) && ((gid_eq(gid, cred->gid) || gid_eq(gid, cred->egid) || - gid_eq(gid, cred->sgid)) || nsown_capable(CAP_SETGID))) { + gid_eq(gid, cred->sgid)) || ns_capable(cred->user_ns, CAP_SETGID))) { return 0; } return -EPERM; -- cgit v1.3-8-gc7d7 From 0edd1b1784cbdad55aca2c1293be018f53c0ab1d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 16:37:22 -0700 Subject: nohz_full: Add full-system-idle state machine This commit adds the state machine that takes the per-CPU idle data as input and produces a full-system-idle indication as output. This state machine is driven out of RCU's quiescent-state-forcing mechanism, which invokes rcu_sysidle_check_cpu() to collect per-CPU idle state and then rcu_sysidle_report() to drive the state machine. The full-system-idle state is sampled using rcu_sys_is_idle(), which also drives the state machine if RCU is idle (and does so by forcing RCU to become non-idle). This function returns true if all but the timekeeping CPU (tick_do_timer_cpu) are idle and have been idle long enough to avoid memory contention on the full_sysidle_state state variable. The rcu_sysidle_force_exit() may be called externally to reset the state machine back into non-idle state. For large systems the state machine is driven out of RCU's force-quiescent-state logic, which provides good scalability at the price of millisecond-scale latencies on the transition to full-system-idle state. This is not so good for battery-powered systems, which are usually small enough that they don't need to care about scalability, but which do care deeply about energy efficiency. Small systems therefore drive the state machine directly out of the idle-entry code. The number of CPUs in a "small" system is defined by a new NO_HZ_FULL_SYSIDLE_SMALL Kconfig parameter, which defaults to 8. Note that this is a build-time definition. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Lai Jiangshan [ paulmck: Use true and false for boolean constants per Lai Jiangshan. ] Reviewed-by: Josh Triplett [ paulmck: Simplify logic and provide better comments for memory barriers, based on review comments and questions by Lai Jiangshan. ] --- include/linux/rcupdate.h | 18 +++ kernel/rcutree.c | 16 ++- kernel/rcutree.h | 5 + kernel/rcutree_plugin.h | 296 ++++++++++++++++++++++++++++++++++++++++++++++- kernel/time/Kconfig | 27 +++++ 5 files changed, 355 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 30bea9c25735..f1f1bc39346b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1011,4 +1011,22 @@ static inline bool rcu_is_nocb_cpu(int cpu) { return false; } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ +/* Only for use by adaptive-ticks code. */ +#ifdef CONFIG_NO_HZ_FULL_SYSIDLE +extern bool rcu_sys_is_idle(void); +extern void rcu_sysidle_force_exit(void); +#else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + +static inline bool rcu_sys_is_idle(void) +{ + return false; +} + +static inline void rcu_sysidle_force_exit(void) +{ +} + +#endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ + + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 7b5be56d95ae..eca70f4469c1 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -734,6 +734,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); + rcu_sysidle_check_cpu(rdp, isidle, maxj); return (rdp->dynticks_snap & 0x1) == 0; } @@ -1373,11 +1374,17 @@ int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) rsp->n_force_qs++; if (fqs_state == RCU_SAVE_DYNTICK) { /* Collect dyntick-idle snapshots. */ + if (is_sysidle_rcu_state(rsp)) { + isidle = 1; + maxj = jiffies - ULONG_MAX / 4; + } force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj); + rcu_sysidle_report_gp(rsp, isidle, maxj); fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ + isidle = 0; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ @@ -2103,9 +2110,12 @@ static void force_qs_rnp(struct rcu_state *rsp, cpu = rnp->grplo; bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { - if ((rnp->qsmask & bit) != 0 && - f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) - mask |= bit; + if ((rnp->qsmask & bit) != 0) { + if ((rnp->qsmaskinit & bit) != 0) + *isidle = 0; + if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) + mask |= bit; + } } if (mask != 0) { diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 9dd8b177f1ac..6fd3659cf01a 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -555,6 +555,11 @@ static void rcu_kick_nohz_cpu(int cpu); static bool init_nocb_callback_list(struct rcu_data *rdp); static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj); +static bool is_sysidle_rcu_state(struct rcu_state *rsp); +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a7419ceb19ad..45ebba747af4 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include "time/tick-internal.h" #define RCU_KTHREAD_PRIO 1 @@ -2382,12 +2382,12 @@ static void rcu_kick_nohz_cpu(int cpu) * most active flavor of RCU. */ #ifdef CONFIG_PREEMPT_RCU -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_preempt_state; +static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; #else /* #ifdef CONFIG_PREEMPT_RCU */ -static struct rcu_state __maybe_unused *rcu_sysidle_state = &rcu_sched_state; +static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ -static int __maybe_unused full_sysidle_state; /* Current system-idle state. */ +static int full_sysidle_state; /* Current system-idle state. */ #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ @@ -2430,6 +2430,38 @@ static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); } +/* + * Unconditionally force exit from full system-idle state. This is + * invoked when a normal CPU exits idle, but must be called separately + * for the timekeeping CPU (tick_do_timer_cpu). The reason for this + * is that the timekeeping CPU is permitted to take scheduling-clock + * interrupts while the system is in system-idle state, and of course + * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock + * interrupt from any other type of interrupt. + */ +void rcu_sysidle_force_exit(void) +{ + int oldstate = ACCESS_ONCE(full_sysidle_state); + int newoldstate; + + /* + * Each pass through the following loop attempts to exit full + * system-idle state. If contention proves to be a problem, + * a trylock-based contention tree could be used here. + */ + while (oldstate > RCU_SYSIDLE_SHORT) { + newoldstate = cmpxchg(&full_sysidle_state, + oldstate, RCU_SYSIDLE_NOT); + if (oldstate == newoldstate && + oldstate == RCU_SYSIDLE_FULL_NOTED) { + rcu_kick_nohz_cpu(tick_do_timer_cpu); + return; /* We cleared it, done! */ + } + oldstate = newoldstate; + } + smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ +} + /* * Invoked to note entry to irq or task transition from idle. Note that * usermode execution does -not- count as idle here! The caller must @@ -2463,6 +2495,247 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) atomic_inc(&rdtp->dynticks_idle); smp_mb__after_atomic_inc(); WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); + + /* + * If we are the timekeeping CPU, we are permitted to be non-idle + * during a system-idle state. This must be the case, because + * the timekeeping CPU has to take scheduling-clock interrupts + * during the time that the system is transitioning to full + * system-idle state. This means that the timekeeping CPU must + * invoke rcu_sysidle_force_exit() directly if it does anything + * more than take a scheduling-clock interrupt. + */ + if (smp_processor_id() == tick_do_timer_cpu) + return; + + /* Update system-idle state: We are clearly no longer fully idle! */ + rcu_sysidle_force_exit(); +} + +/* + * Check to see if the current CPU is idle. Note that usermode execution + * does not count as idle. The caller must have disabled interrupts. + */ +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ + int cur; + unsigned long j; + struct rcu_dynticks *rdtp = rdp->dynticks; + + /* + * If some other CPU has already reported non-idle, if this is + * not the flavor of RCU that tracks sysidle state, or if this + * is an offline or the timekeeping CPU, nothing to do. + */ + if (!*isidle || rdp->rsp != rcu_sysidle_state || + cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) + return; + /* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */ + + /* Pick up current idle and NMI-nesting counter and check. */ + cur = atomic_read(&rdtp->dynticks_idle); + if (cur & 0x1) { + *isidle = false; /* We are not idle! */ + return; + } + smp_mb(); /* Read counters before timestamps. */ + + /* Pick up timestamps. */ + j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); + /* If this CPU entered idle more recently, update maxj timestamp. */ + if (ULONG_CMP_LT(*maxj, j)) + *maxj = j; +} + +/* + * Is this the flavor of RCU that is handling full-system idle? + */ +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return rsp == rcu_sysidle_state; +} + +/* + * Return a delay in jiffies based on the number of CPUs, rcu_node + * leaf fanout, and jiffies tick rate. The idea is to allow larger + * systems more time to transition to full-idle state in order to + * avoid the cache thrashing that otherwise occur on the state variable. + * Really small systems (less than a couple of tens of CPUs) should + * instead use a single global atomically incremented counter, and later + * versions of this will automatically reconfigure themselves accordingly. + */ +static unsigned long rcu_sysidle_delay(void) +{ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return 0; + return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); +} + +/* + * Advance the full-system-idle state. This is invoked when all of + * the non-timekeeping CPUs are idle. + */ +static void rcu_sysidle(unsigned long j) +{ + /* Check the current state. */ + switch (ACCESS_ONCE(full_sysidle_state)) { + case RCU_SYSIDLE_NOT: + + /* First time all are idle, so note a short idle period. */ + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; + break; + + case RCU_SYSIDLE_SHORT: + + /* + * Idle for a bit, time to advance to next state? + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); + break; + + case RCU_SYSIDLE_LONG: + + /* + * Do an additional check pass before advancing to full. + * cmpxchg failure means race with non-idle, let them win. + */ + if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) + (void)cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); + break; + + default: + break; + } +} + +/* + * Found a non-idle non-timekeeping CPU, so kick the system-idle state + * back to the beginning. + */ +static void rcu_sysidle_cancel(void) +{ + smp_mb(); + ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; +} + +/* + * Update the sysidle state based on the results of a force-quiescent-state + * scan of the CPUs' dyntick-idle state. + */ +static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, + unsigned long maxj, bool gpkt) +{ + if (rsp != rcu_sysidle_state) + return; /* Wrong flavor, ignore. */ + if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) + return; /* Running state machine from timekeeping CPU. */ + if (isidle) + rcu_sysidle(maxj); /* More idle! */ + else + rcu_sysidle_cancel(); /* Idle is over. */ +} + +/* + * Wrapper for rcu_sysidle_report() when called from the grace-period + * kthread's context. + */ +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ + rcu_sysidle_report(rsp, isidle, maxj, true); +} + +/* Callback and function for forcing an RCU grace period. */ +struct rcu_sysidle_head { + struct rcu_head rh; + int inuse; +}; + +static void rcu_sysidle_cb(struct rcu_head *rhp) +{ + struct rcu_sysidle_head *rshp; + + /* + * The following memory barrier is needed to replace the + * memory barriers that would normally be in the memory + * allocator. + */ + smp_mb(); /* grace period precedes setting inuse. */ + + rshp = container_of(rhp, struct rcu_sysidle_head, rh); + ACCESS_ONCE(rshp->inuse) = 0; +} + +/* + * Check to see if the system is fully idle, other than the timekeeping CPU. + * The caller must have disabled interrupts. + */ +bool rcu_sys_is_idle(void) +{ + static struct rcu_sysidle_head rsh; + int rss = ACCESS_ONCE(full_sysidle_state); + + if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) + return false; + + /* Handle small-system case by doing a full scan of CPUs. */ + if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { + int oldrss = rss - 1; + + /* + * One pass to advance to each state up to _FULL. + * Give up if any pass fails to advance the state. + */ + while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { + int cpu; + bool isidle = true; + unsigned long maxj = jiffies - ULONG_MAX / 4; + struct rcu_data *rdp; + + /* Scan all the CPUs looking for nonidle CPUs. */ + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); + rcu_sysidle_check_cpu(rdp, &isidle, &maxj); + if (!isidle) + break; + } + rcu_sysidle_report(rcu_sysidle_state, + isidle, maxj, false); + oldrss = rss; + rss = ACCESS_ONCE(full_sysidle_state); + } + } + + /* If this is the first observation of an idle period, record it. */ + if (rss == RCU_SYSIDLE_FULL) { + rss = cmpxchg(&full_sysidle_state, + RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); + return rss == RCU_SYSIDLE_FULL; + } + + smp_mb(); /* ensure rss load happens before later caller actions. */ + + /* If already fully idle, tell the caller (in case of races). */ + if (rss == RCU_SYSIDLE_FULL_NOTED) + return true; + + /* + * If we aren't there yet, and a grace period is not in flight, + * initiate a grace period. Either way, tell the caller that + * we are not there yet. We use an xchg() rather than an assignment + * to make up for the memory barriers that would otherwise be + * provided by the memory allocator. + */ + if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && + !rcu_gp_in_progress(rcu_sysidle_state) && + !rsh.inuse && xchg(&rsh.inuse, 1) == 0) + call_rcu(&rsh.rh, rcu_sysidle_cb); + return false; } /* @@ -2483,6 +2756,21 @@ static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) { } +static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, + unsigned long *maxj) +{ +} + +static bool is_sysidle_rcu_state(struct rcu_state *rsp) +{ + return false; +} + +static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, + unsigned long maxj) +{ +} + static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) { } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index c7d2fd67799e..3381f098070f 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -157,6 +157,33 @@ config NO_HZ_FULL_SYSIDLE Say N if you are unsure. +config NO_HZ_FULL_SYSIDLE_SMALL + int "Number of CPUs above which large-system approach is used" + depends on NO_HZ_FULL_SYSIDLE + range 1 NR_CPUS + default 8 + help + The full-system idle detection mechanism takes a lazy approach + on large systems, as is required to attain decent scalability. + However, on smaller systems, scalability is not anywhere near as + large a concern as is energy efficiency. The sysidle subsystem + therefore uses a fast but non-scalable algorithm for small + systems and a lazier but scalable algorithm for large systems. + This Kconfig parameter defines the number of CPUs in the largest + system that will be considered to be "small". + + The default value will be fine in most cases. Battery-powered + systems that (1) enable NO_HZ_FULL_SYSIDLE, (2) have larger + numbers of CPUs, and (3) are suffering from battery-lifetime + problems due to long sysidle latencies might wish to experiment + with larger values for this Kconfig parameter. On the other + hand, they might be even better served by disabling NO_HZ_FULL + entirely, given that NO_HZ_FULL is intended for HPC and + real-time workloads that at present do not tend to be run on + battery-powered systems. + + Take the default if you are unsure. + config NO_HZ bool "Old Idle dynticks config" depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS -- cgit v1.3-8-gc7d7 From eb75767be0e514f97bf1b5cec763696cfc7f7e2a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 21 Jun 2013 17:10:40 -0700 Subject: nohz_full: Force RCU's grace-period kthreads onto timekeeping CPU Because RCU's quiescent-state-forcing mechanism is used to drive the full-system-idle state machine, and because this mechanism is executed by RCU's grace-period kthreads, this commit forces these kthreads to run on the timekeeping CPU (tick_do_timer_cpu). To do otherwise would mean that the RCU grace-period kthreads would force the system into non-idle state every time they drove the state machine, which would be just a bit on the futile side. Signed-off-by: Paul E. McKenney Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Lai Jiangshan Reviewed-by: Josh Triplett --- kernel/rcutree.c | 1 + kernel/rcutree.h | 1 + kernel/rcutree_plugin.h | 21 ++++++++++++++++++++- 3 files changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcutree.c b/kernel/rcutree.c index eca70f4469c1..64eaafb6c8f7 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1303,6 +1303,7 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); rsp->gp_flags = 0; /* Clear all flags: New grace period. */ diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 6fd3659cf01a..5f97eab602cd 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -560,6 +560,7 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, static bool is_sysidle_rcu_state(struct rcu_state *rsp); static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj); +static void rcu_bind_gp_kthread(void); static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); #endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 45ebba747af4..130c97b027f2 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2531,7 +2531,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, if (!*isidle || rdp->rsp != rcu_sysidle_state || cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) return; - /* WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); */ + if (rcu_gp_in_progress(rdp->rsp)) + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); /* Pick up current idle and NMI-nesting counter and check. */ cur = atomic_read(&rdtp->dynticks_idle); @@ -2556,6 +2557,20 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return rsp == rcu_sysidle_state; } +/* + * Bind the grace-period kthread for the sysidle flavor of RCU to the + * timekeeping CPU. + */ +static void rcu_bind_gp_kthread(void) +{ + int cpu = ACCESS_ONCE(tick_do_timer_cpu); + + if (cpu < 0 || cpu >= nr_cpu_ids) + return; + if (raw_smp_processor_id() != cpu) + set_cpus_allowed_ptr(current, cpumask_of(cpu)); +} + /* * Return a delay in jiffies based on the number of CPUs, rcu_node * leaf fanout, and jiffies tick rate. The idea is to allow larger @@ -2766,6 +2781,10 @@ static bool is_sysidle_rcu_state(struct rcu_state *rsp) return false; } +static void rcu_bind_gp_kthread(void) +{ +} + static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, unsigned long maxj) { -- cgit v1.3-8-gc7d7 From ae23bff1d71f8b416ed740bc458df67355c77c92 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Sat, 24 Aug 2013 16:45:54 +0200 Subject: perf: Prevent race in unthrottling code The current throttling code triggers WARN below via following workload (only hit on AMD machine with 48 CPUs): # while [ 1 ]; do perf record perf bench sched messaging; done WARNING: at arch/x86/kernel/cpu/perf_event.c:1054 x86_pmu_start+0xc6/0x100() SNIP Call Trace: [] dump_stack+0x19/0x1b [] warn_slowpath_common+0x61/0x80 [] warn_slowpath_null+0x1a/0x20 [] x86_pmu_start+0xc6/0x100 [] perf_adjust_freq_unthr_context.part.75+0x182/0x1a0 [] perf_event_task_tick+0xc8/0xf0 [] scheduler_tick+0xd1/0x140 [] update_process_times+0x66/0x80 [] tick_sched_handle.isra.15+0x25/0x60 [] tick_sched_timer+0x41/0x60 [] __run_hrtimer+0x74/0x1d0 [] ? tick_sched_handle.isra.15+0x60/0x60 [] hrtimer_interrupt+0xf7/0x240 [] smp_apic_timer_interrupt+0x69/0x9c [] apic_timer_interrupt+0x6d/0x80 [] ? __perf_event_task_sched_in+0x184/0x1a0 [] ? kfree_skbmem+0x37/0x90 [] ? __slab_free+0x1ac/0x30f [] ? kfree+0xfd/0x130 [] kmem_cache_free+0x1b2/0x1d0 [] kfree_skbmem+0x37/0x90 [] consume_skb+0x34/0x80 [] unix_stream_recvmsg+0x4e7/0x820 [] sock_aio_read.part.7+0x116/0x130 [] ? __perf_sw_event+0x19c/0x1e0 [] sock_aio_read+0x21/0x30 [] do_sync_read+0x80/0xb0 [] vfs_read+0x145/0x170 [] SyS_read+0x49/0xa0 [] ? __audit_syscall_exit+0x1f6/0x2a0 [] system_call_fastpath+0x16/0x1b ---[ end trace 622b7e226c4a766a ]--- The reason is a race in perf_event_task_tick() throttling code. The race flow (simplified code): - perf_throttled_count is per cpu variable and is CPU throttling flag, here starting with 0 - perf_throttled_seq is sequence/domain for allowed count of interrupts within the tick, gets increased each tick on single CPU (CPU bounded event): ... workload perf_event_task_tick: | | T0 inc(perf_throttled_seq) | T1 needs_unthr = xchg(perf_throttled_count, 0) == 0 tick gets interrupted: ... event gets throttled under new seq ... T2 last NMI comes, event is throttled - inc(perf_throttled_count) back to tick: | perf_adjust_freq_unthr_context: | | T3 unthrottling is skiped for event (needs_unthr == 0) | T4 event is stop and started via freq adjustment | tick ends ... workload ... no sample is hit for event ... perf_event_task_tick: | | T5 needs_unthr = xchg(perf_throttled_count, 0) != 0 (from T2) | T6 unthrottling is done on event (interrupts == MAX_INTERRUPTS) | event is already started (from T4) -> WARN Fixing this by not checking needs_unthr again and thus check all events for unthrottling. Signed-off-by: Jiri Olsa Reported-by: Jan Stancek Suggested-by: Peter Zijlstra Cc: Corey Ashford Cc: Frederic Weisbecker Cc: Namhyung Kim Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1377355554-8934-1-git-send-email-jolsa@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f86599e8c123..258eaaffe95a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2712,7 +2712,7 @@ static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx, hwc = &event->hw; - if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) { + if (hwc->interrupts == MAX_INTERRUPTS) { hwc->interrupts = 0; perf_log_throttle(event, 1); event->pmu->start(event, 0); -- cgit v1.3-8-gc7d7 From 95a79b805b935f4a7b685aa8a117d916c638323e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 6 Aug 2013 17:36:41 +0900 Subject: sched: Remove one division operation in find_busiest_queue() Remove one division operation in find_busiest_queue() by using crosswise multiplication: wl_i / power_i > wl_j / power_j := wl_i * power_j > wl_j * power_i Signed-off-by: Joonsoo Kim [ Expanded the changelog. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375778203-31343-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f918635efe09..8aa217f62a9e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4968,7 +4968,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, struct sched_group *group) { struct rq *busiest = NULL, *rq; - unsigned long max_load = 0; + unsigned long busiest_load = 0, busiest_power = 1; int i; for_each_cpu(i, sched_group_cpus(group)) { @@ -4998,11 +4998,15 @@ static struct rq *find_busiest_queue(struct lb_env *env, * the weighted_cpuload() scaled with the cpu power, so that * the load can be moved away from the cpu that is potentially * running at a lower capacity. + * + * Thus we're looking for max(wl_i / power_i), crosswise + * multiplication to rid ourselves of the division works out + * to: wl_i * power_j > wl_j * power_i; where j is our + * previous maximum. */ - wl = (wl * SCHED_POWER_SCALE) / power; - - if (wl > max_load) { - max_load = wl; + if (wl * busiest_power > busiest_load * power) { + busiest_load = wl; + busiest_power = power; busiest = rq; } } -- cgit v1.3-8-gc7d7 From 23f0d2093c789e612185180c468fa09063834e87 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 6 Aug 2013 17:36:42 +0900 Subject: sched: Factor out code to should_we_balance() Now checking whether this cpu is appropriate to balance or not is embedded into update_sg_lb_stats() and this checking has no direct relationship to this function. There is not enough reason to place this checking at update_sg_lb_stats(), except saving one iteration for sched_group_cpus. In this patch, I factor out this checking to should_we_balance() function. And before doing actual work for load_balancing, check whether this cpu is appropriate to balance via should_we_balance(). If this cpu is not a candidate for balancing, it quit the work immediately. With this change, we can save two memset cost and can expect better compiler optimization. Below is result of this patch. * Vanilla * text data bss dec hex filename 34499 1136 116 35751 8ba7 kernel/sched/fair.o * Patched * text data bss dec hex filename 34243 1136 116 35495 8aa7 kernel/sched/fair.o In addition, rename @balance to @continue_balancing in order to represent its purpose more clearly. Signed-off-by: Joonsoo Kim [ s/should_balance/continue_balancing/g ] Reviewed-by: Paul Turner [ Made style changes and a fix in should_we_balance(). ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375778203-31343-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 107 ++++++++++++++++++++++++++-------------------------- 1 file changed, 53 insertions(+), 54 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8aa217f62a9e..9a6daf86a76a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4463,22 +4463,17 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) * @group: sched_group whose statistics are to be updated. * @load_idx: Load index of sched_domain of this_cpu for load calc. * @local_group: Does group contain this_cpu. - * @balance: Should we balance. * @sgs: variable to hold the statistics for this group. */ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, - int local_group, int *balance, struct sg_lb_stats *sgs) + int local_group, struct sg_lb_stats *sgs) { unsigned long nr_running, max_nr_running, min_nr_running; unsigned long load, max_cpu_load, min_cpu_load; - unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long avg_load_per_task = 0; int i; - if (local_group) - balance_cpu = group_balance_cpu(group); - /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; min_cpu_load = ~0UL; @@ -4492,12 +4487,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu && - cpumask_test_cpu(i, sched_group_mask(group))) { - first_idle_cpu = 1; - balance_cpu = i; - } - load = target_load(i, load_idx); } else { load = source_load(i, load_idx); @@ -4519,22 +4508,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->idle_cpus++; } - /* - * First idle cpu or the first cpu(busiest) in this sched group - * is eligible for doing load balancing at this and above - * domains. In the newly idle case, we will allow all the cpu's - * to do the newly idle load balance. - */ - if (local_group) { - if (env->idle != CPU_NEWLY_IDLE) { - if (balance_cpu != env->dst_cpu) { - *balance = 0; - return; - } - update_group_power(env->sd, env->dst_cpu); - } else if (time_after_eq(jiffies, group->sgp->next_update)) - update_group_power(env->sd, env->dst_cpu); - } + if (local_group && (env->idle != CPU_NEWLY_IDLE || + time_after_eq(jiffies, group->sgp->next_update))) + update_group_power(env->sd, env->dst_cpu); /* Adjust by relative CPU power of the group */ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; @@ -4613,7 +4589,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, * @sds: variable to hold the statistics for this sched_domain. */ static inline void update_sd_lb_stats(struct lb_env *env, - int *balance, struct sd_lb_stats *sds) + struct sd_lb_stats *sds) { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; @@ -4630,10 +4606,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs); - - if (local_group && !(*balance)) - return; + update_sg_lb_stats(env, sg, load_idx, local_group, &sgs); sds->total_load += sgs.group_load; sds->total_pwr += sg->sgp->power; @@ -4866,8 +4839,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * to restore balance. * * @env: The load balancing environment. - * @balance: Pointer to a variable indicating if this_cpu - * is the appropriate cpu to perform load balancing at this_level. * * Returns: - the busiest group if imbalance exists. * - If no imbalance and user has opted for power-savings balance, @@ -4875,7 +4846,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * put to idle by rebalancing its tasks onto our group. */ static struct sched_group * -find_busiest_group(struct lb_env *env, int *balance) +find_busiest_group(struct lb_env *env) { struct sd_lb_stats sds; @@ -4885,14 +4856,7 @@ find_busiest_group(struct lb_env *env, int *balance) * Compute the various statistics relavent for load balancing at * this level. */ - update_sd_lb_stats(env, balance, &sds); - - /* - * this_cpu is not the appropriate cpu to perform load balancing at - * this level. - */ - if (!(*balance)) - goto ret; + update_sd_lb_stats(env, &sds); if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) @@ -4956,7 +4920,6 @@ force_balance: return sds.busiest; out_balanced: -ret: env->imbalance = 0; return NULL; } @@ -5043,13 +5006,47 @@ static int need_active_balance(struct lb_env *env) static int active_load_balance_cpu_stop(void *data); +static int should_we_balance(struct lb_env *env) +{ + struct sched_group *sg = env->sd->groups; + struct cpumask *sg_cpus, *sg_mask; + int cpu, balance_cpu = -1; + + /* + * In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. + */ + if (env->idle == CPU_NEWLY_IDLE) + return 1; + + sg_cpus = sched_group_cpus(sg); + sg_mask = sched_group_mask(sg); + /* Try to find first idle cpu */ + for_each_cpu_and(cpu, sg_cpus, env->cpus) { + if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu)) + continue; + + balance_cpu = cpu; + break; + } + + if (balance_cpu == -1) + balance_cpu = group_balance_cpu(sg); + + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above domains. + */ + return balance_cpu != env->dst_cpu; +} + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, - int *balance) + int *continue_balancing) { int ld_moved, cur_ld_moved, active_balance = 0; struct sched_group *group; @@ -5079,11 +5076,12 @@ static int load_balance(int this_cpu, struct rq *this_rq, schedstat_inc(sd, lb_count[idle]); redo: - group = find_busiest_group(&env, balance); - - if (*balance == 0) + if (!should_we_balance(&env)) { + *continue_balancing = 0; goto out_balanced; + } + group = find_busiest_group(&env); if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -5296,7 +5294,7 @@ void idle_balance(int this_cpu, struct rq *this_rq) rcu_read_lock(); for_each_domain(this_cpu, sd) { unsigned long interval; - int balance = 1; + int continue_balancing = 1; if (!(sd->flags & SD_LOAD_BALANCE)) continue; @@ -5304,7 +5302,8 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ pulled_task = load_balance(this_cpu, this_rq, - sd, CPU_NEWLY_IDLE, &balance); + sd, CPU_NEWLY_IDLE, + &continue_balancing); } interval = msecs_to_jiffies(sd->balance_interval); @@ -5542,7 +5541,7 @@ void update_max_interval(void) */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { - int balance = 1; + int continue_balancing = 1; struct rq *rq = cpu_rq(cpu); unsigned long interval; struct sched_domain *sd; @@ -5574,7 +5573,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(cpu, rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { /* * The LBF_SOME_PINNED logic could have changed * env->dst_cpu, so we can't know our idle @@ -5597,7 +5596,7 @@ out: * CPU in our sched group which is doing load balancing more * actively. */ - if (!balance) + if (!continue_balancing) break; } rcu_read_unlock(); -- cgit v1.3-8-gc7d7 From 56cf515b4b1567c4e8fa9926175b40c66b9ec472 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 6 Aug 2013 17:36:43 +0900 Subject: sched: Clean-up struct sd_lb_stat There is no reason to maintain separate variables for this_group and busiest_group in sd_lb_stat, except saving some space. But this structure is always allocated in stack, so this saving isn't really benificial [peterz: reducing stack space is good; in this case readability increases enough that I think its still beneficial] This patch unify these variables, so IMO, readability may be improved. Signed-off-by: Joonsoo Kim [ Rename this to local -- avoids confusion between this_cpu and the C++ this pointer. ] Reviewed-by: Paul Turner [ Lots of style edits, a few fixes and a rename. ] Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1375778203-31343-4-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 229 ++++++++++++++++++++++++++-------------------------- 1 file changed, 114 insertions(+), 115 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9a6daf86a76a..2da80a55827b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4231,36 +4231,6 @@ static unsigned long task_h_load(struct task_struct *p) #endif /********** Helpers for find_busiest_group ************************/ -/* - * sd_lb_stats - Structure to store the statistics of a sched_domain - * during load balancing. - */ -struct sd_lb_stats { - struct sched_group *busiest; /* Busiest group in this sd */ - struct sched_group *this; /* Local group in this sd */ - unsigned long total_load; /* Total load of all groups in sd */ - unsigned long total_pwr; /* Total power of all groups in sd */ - unsigned long avg_load; /* Average load across all groups in sd */ - - /** Statistics of this group */ - unsigned long this_load; - unsigned long this_load_per_task; - unsigned long this_nr_running; - unsigned long this_has_capacity; - unsigned int this_idle_cpus; - - /* Statistics of the busiest group */ - unsigned int busiest_idle_cpus; - unsigned long max_load; - unsigned long busiest_load_per_task; - unsigned long busiest_nr_running; - unsigned long busiest_group_capacity; - unsigned long busiest_has_capacity; - unsigned int busiest_group_weight; - - int group_imb; /* Is there imbalance in this sd */ -}; - /* * sg_lb_stats - stats of a sched_group required for load_balancing */ @@ -4269,6 +4239,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ + unsigned long load_per_task; unsigned long group_capacity; unsigned long idle_cpus; unsigned long group_weight; @@ -4276,6 +4247,21 @@ struct sg_lb_stats { int group_has_capacity; /* Is there extra capacity in the group? */ }; +/* + * sd_lb_stats - Structure to store the statistics of a sched_domain + * during load balancing. + */ +struct sd_lb_stats { + struct sched_group *busiest; /* Busiest group in this sd */ + struct sched_group *local; /* Local group in this sd */ + unsigned long total_load; /* Total load of all groups in sd */ + unsigned long total_pwr; /* Total power of all groups in sd */ + unsigned long avg_load; /* Average load across all groups in sd */ + + struct sg_lb_stats local_stat; /* Statistics of the local group */ + struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ +}; + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. @@ -4490,6 +4476,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); + if (load > max_cpu_load) max_cpu_load = load; if (min_cpu_load > load) @@ -4531,10 +4518,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, (max_nr_running - min_nr_running) > 1) sgs->group_imb = 1; - sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, - SCHED_POWER_SCALE); + sgs->group_capacity = + DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); + if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(env->sd, group); + sgs->group_weight = group->group_weight; if (sgs->group_capacity > sgs->sum_nr_running) @@ -4556,7 +4545,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, struct sched_group *sg, struct sg_lb_stats *sgs) { - if (sgs->avg_load <= sds->max_load) + if (sgs->avg_load <= sds->busiest_stat.avg_load) return false; if (sgs->sum_nr_running > sgs->group_capacity) @@ -4593,7 +4582,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; - struct sg_lb_stats sgs; + struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; if (child && child->flags & SD_PREFER_SIBLING) @@ -4602,14 +4591,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, load_idx = get_sd_load_idx(env->sd, env->idle); do { + struct sg_lb_stats *sgs = &tmp_sgs; int local_group; local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); - memset(&sgs, 0, sizeof(sgs)); - update_sg_lb_stats(env, sg, load_idx, local_group, &sgs); + if (local_group) { + sds->local = sg; + sgs = &sds->local_stat; + } - sds->total_load += sgs.group_load; - sds->total_pwr += sg->sgp->power; + memset(sgs, 0, sizeof(*sgs)); + update_sg_lb_stats(env, sg, load_idx, local_group, sgs); /* * In case the child domain prefers tasks go to siblings @@ -4621,26 +4613,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, * heaviest group when it is already under-utilized (possible * with a large weight task outweighs the tasks on the system). */ - if (prefer_sibling && !local_group && sds->this_has_capacity) - sgs.group_capacity = min(sgs.group_capacity, 1UL); + if (prefer_sibling && !local_group && + sds->local && sds->local_stat.group_has_capacity) + sgs->group_capacity = min(sgs->group_capacity, 1UL); - if (local_group) { - sds->this_load = sgs.avg_load; - sds->this = sg; - sds->this_nr_running = sgs.sum_nr_running; - sds->this_load_per_task = sgs.sum_weighted_load; - sds->this_has_capacity = sgs.group_has_capacity; - sds->this_idle_cpus = sgs.idle_cpus; - } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) { - sds->max_load = sgs.avg_load; + /* Now, start updating sd_lb_stats */ + sds->total_load += sgs->group_load; + sds->total_pwr += sg->sgp->power; + + if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; - sds->busiest_nr_running = sgs.sum_nr_running; - sds->busiest_idle_cpus = sgs.idle_cpus; - sds->busiest_group_capacity = sgs.group_capacity; - sds->busiest_load_per_task = sgs.sum_weighted_load; - sds->busiest_has_capacity = sgs.group_has_capacity; - sds->busiest_group_weight = sgs.group_weight; - sds->group_imb = sgs.group_imb; + sds->busiest_stat = *sgs; } sg = sg->next; @@ -4684,8 +4667,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (env->dst_cpu > busiest_cpu) return 0; - env->imbalance = DIV_ROUND_CLOSEST( - sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE); + env->imbalance = DIV_ROUND_CLOSEST(sds->busiest_stat.avg_load * + sds->busiest->sgp->power, SCHED_POWER_SCALE); return 1; } @@ -4703,24 +4686,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) unsigned long tmp, pwr_now = 0, pwr_move = 0; unsigned int imbn = 2; unsigned long scaled_busy_load_per_task; + struct sg_lb_stats *local, *busiest; - if (sds->this_nr_running) { - sds->this_load_per_task /= sds->this_nr_running; - if (sds->busiest_load_per_task > - sds->this_load_per_task) - imbn = 1; - } else { - sds->this_load_per_task = - cpu_avg_load_per_task(env->dst_cpu); - } + local = &sds->local_stat; + busiest = &sds->busiest_stat; - scaled_busy_load_per_task = sds->busiest_load_per_task - * SCHED_POWER_SCALE; - scaled_busy_load_per_task /= sds->busiest->sgp->power; + if (!local->sum_nr_running) + local->load_per_task = cpu_avg_load_per_task(env->dst_cpu); + else if (busiest->load_per_task > local->load_per_task) + imbn = 1; - if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { - env->imbalance = sds->busiest_load_per_task; + scaled_busy_load_per_task = + (busiest->load_per_task * SCHED_POWER_SCALE) / + sds->busiest->sgp->power; + + if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= + (scaled_busy_load_per_task * imbn)) { + env->imbalance = busiest->load_per_task; return; } @@ -4731,33 +4713,36 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) */ pwr_now += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load); - pwr_now += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load); + min(busiest->load_per_task, busiest->avg_load); + pwr_now += sds->local->sgp->power * + min(local->load_per_task, local->avg_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / + tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / sds->busiest->sgp->power; - if (sds->max_load > tmp) + if (busiest->avg_load > tmp) { pwr_move += sds->busiest->sgp->power * - min(sds->busiest_load_per_task, sds->max_load - tmp); + min(busiest->load_per_task, + busiest->avg_load - tmp); + } /* Amount of load we'd add */ - if (sds->max_load * sds->busiest->sgp->power < - sds->busiest_load_per_task * SCHED_POWER_SCALE) - tmp = (sds->max_load * sds->busiest->sgp->power) / - sds->this->sgp->power; - else - tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) / - sds->this->sgp->power; - pwr_move += sds->this->sgp->power * - min(sds->this_load_per_task, sds->this_load + tmp); + if (busiest->avg_load * sds->busiest->sgp->power < + busiest->load_per_task * SCHED_POWER_SCALE) { + tmp = (busiest->avg_load * sds->busiest->sgp->power) / + sds->local->sgp->power; + } else { + tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / + sds->local->sgp->power; + } + pwr_move += sds->local->sgp->power * + min(local->load_per_task, local->avg_load + tmp); pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ if (pwr_move > pwr_now) - env->imbalance = sds->busiest_load_per_task; + env->imbalance = busiest->load_per_task; } /** @@ -4769,11 +4754,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds) { unsigned long max_pull, load_above_capacity = ~0UL; + struct sg_lb_stats *local, *busiest; + + local = &sds->local_stat; + if (local->sum_nr_running) { + local->load_per_task = + local->sum_weighted_load / local->sum_nr_running; + } - sds->busiest_load_per_task /= sds->busiest_nr_running; - if (sds->group_imb) { - sds->busiest_load_per_task = - min(sds->busiest_load_per_task, sds->avg_load); + busiest = &sds->busiest_stat; + /* busiest must have some tasks */ + busiest->load_per_task = + busiest->sum_weighted_load / busiest->sum_nr_running; + + if (busiest->group_imb) { + busiest->load_per_task = + min(busiest->load_per_task, sds->avg_load); } /* @@ -4781,20 +4777,19 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (sds->max_load < sds->avg_load) { + if (busiest->avg_load < sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } - if (!sds->group_imb) { + if (!busiest->group_imb) { /* * Don't want to pull so many tasks that a group would go idle. */ - load_above_capacity = (sds->busiest_nr_running - - sds->busiest_group_capacity); + load_above_capacity = + (busiest->sum_nr_running - busiest->group_capacity); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->sgp->power; } @@ -4808,12 +4803,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * Be careful of negative numbers as they'll appear as very large values * with unsigned longs. */ - max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); + max_pull = min(busiest->avg_load - sds->avg_load, + load_above_capacity); /* How much load to actually move to equalise the imbalance */ - env->imbalance = min(max_pull * sds->busiest->sgp->power, - (sds->avg_load - sds->this_load) * sds->this->sgp->power) - / SCHED_POWER_SCALE; + env->imbalance = min( + max_pull * sds->busiest->sgp->power, + (sds->avg_load - local->avg_load) * sds->local->sgp->power + ) / SCHED_POWER_SCALE; /* * if *imbalance is less than the average load per runnable task @@ -4821,9 +4818,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * a think about bumping its value to force at least one task to be * moved */ - if (env->imbalance < sds->busiest_load_per_task) + if (env->imbalance < busiest->load_per_task) return fix_small_imbalance(env, sds); - } /******* find_busiest_group() helpers end here *********************/ @@ -4845,9 +4841,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * return the least loaded group whose CPUs can be * put to idle by rebalancing its tasks onto our group. */ -static struct sched_group * -find_busiest_group(struct lb_env *env) +static struct sched_group *find_busiest_group(struct lb_env *env) { + struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; memset(&sds, 0, sizeof(sds)); @@ -4857,13 +4853,15 @@ find_busiest_group(struct lb_env *env) * this level. */ update_sd_lb_stats(env, &sds); + local = &sds.local_stat; + busiest = &sds.busiest_stat; if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; /* There is no busy sibling group to pull tasks from */ - if (!sds.busiest || sds.busiest_nr_running == 0) + if (!sds.busiest || busiest->sum_nr_running == 0) goto out_balanced; sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr; @@ -4873,26 +4871,26 @@ find_busiest_group(struct lb_env *env) * work because they assumes all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ - if (sds.group_imb) + if (busiest->group_imb) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity && - !sds.busiest_has_capacity) + if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity && + !busiest->group_has_capacity) goto force_balance; /* * If the local group is more busy than the selected busiest group * don't try and pull any tasks. */ - if (sds.this_load >= sds.max_load) + if (local->avg_load >= busiest->avg_load) goto out_balanced; /* * Don't pull any tasks if this group is already above the domain * average load. */ - if (sds.this_load >= sds.avg_load) + if (local->avg_load >= sds.avg_load) goto out_balanced; if (env->idle == CPU_IDLE) { @@ -4902,15 +4900,16 @@ find_busiest_group(struct lb_env *env) * there is no imbalance between this and busiest group * wrt to idle cpu's, it is balanced. */ - if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) && - sds.busiest_nr_running <= sds.busiest_group_weight) + if ((local->idle_cpus < busiest->idle_cpus) && + busiest->sum_nr_running <= busiest->group_weight) goto out_balanced; } else { /* * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use * imbalance_pct to be conservative. */ - if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load) + if (100 * busiest->avg_load <= + env->sd->imbalance_pct * local->avg_load) goto out_balanced; } -- cgit v1.3-8-gc7d7 From 147c5fc2bad780d8093b547f2baa204e78107faf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Aug 2013 15:22:57 +0200 Subject: sched/fair: Shrink sg_lb_stats and play memset games We can shrink sg_lb_stats because rq::nr_running is an unsigned int and cpu numbers are 'int' Before: sgs: /* size: 72, cachelines: 2, members: 10 */ sds: /* size: 184, cachelines: 3, members: 7 */ After: sgs: /* size: 56, cachelines: 1, members: 10 */ sds: /* size: 152, cachelines: 3, members: 7 */ Further we can avoid clearing all of sds since we do a total clear/assignment of sg_stats in update_sg_lb_stats() with exception of busiest_stat.avg_load which is referenced in update_sd_pick_busiest(). Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-0klzmz9okll8wc0nsudguc9p@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2da80a55827b..4c6a8a5a789a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4237,12 +4237,12 @@ static unsigned long task_h_load(struct task_struct *p) struct sg_lb_stats { unsigned long avg_load; /*Avg load across the CPUs of the group */ unsigned long group_load; /* Total load over the CPUs of the group */ - unsigned long sum_nr_running; /* Nr tasks running in the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; - unsigned long group_capacity; - unsigned long idle_cpus; - unsigned long group_weight; + unsigned int sum_nr_running; /* Nr tasks running in the group */ + unsigned int group_capacity; + unsigned int idle_cpus; + unsigned int group_weight; int group_imb; /* Is there an imbalance in the group ? */ int group_has_capacity; /* Is there extra capacity in the group? */ }; @@ -4258,10 +4258,29 @@ struct sd_lb_stats { unsigned long total_pwr; /* Total power of all groups in sd */ unsigned long avg_load; /* Average load across all groups in sd */ - struct sg_lb_stats local_stat; /* Statistics of the local group */ struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */ + struct sg_lb_stats local_stat; /* Statistics of the local group */ }; +static inline void init_sd_lb_stats(struct sd_lb_stats *sds) +{ + /* + * Skimp on the clearing to avoid duplicate work. We can avoid clearing + * local_stat because update_sg_lb_stats() does a full clear/assignment. + * We must however clear busiest_stat::avg_load because + * update_sd_pick_busiest() reads this before assignment. + */ + *sds = (struct sd_lb_stats){ + .busiest = NULL, + .local = NULL, + .total_load = 0UL, + .total_pwr = 0UL, + .busiest_stat = { + .avg_load = 0UL, + }, + }; +} + /** * get_sd_load_idx - Obtain the load index for a given sched domain. * @sd: The sched_domain whose load_idx is to be obtained. @@ -4615,7 +4634,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, */ if (prefer_sibling && !local_group && sds->local && sds->local_stat.group_has_capacity) - sgs->group_capacity = min(sgs->group_capacity, 1UL); + sgs->group_capacity = min(sgs->group_capacity, 1U); /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; @@ -4846,7 +4865,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) struct sg_lb_stats *local, *busiest; struct sd_lb_stats sds; - memset(&sds, 0, sizeof(sds)); + init_sd_lb_stats(&sds); /* * Compute the various statistics relavent for load balancing at -- cgit v1.3-8-gc7d7 From 38d0f7708543bcfa03d5ee55e8346f801b4a59c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Aug 2013 19:47:56 +0200 Subject: sched/fair: Remove duplicate load_per_task computations Since we already compute (but don't store) the sgs load_per_task value in update_sg_lb_stats() we might as well store it and not re-compute it later on. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ym1vmljiwbzgdnnrwp9azftq@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4c6a8a5a789a..57952198b01e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4476,7 +4476,6 @@ static inline void update_sg_lb_stats(struct lb_env *env, { unsigned long nr_running, max_nr_running, min_nr_running; unsigned long load, max_cpu_load, min_cpu_load; - unsigned long avg_load_per_task = 0; int i; /* Tally up the load of all CPUs in the group */ @@ -4531,9 +4530,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, * the hierarchy? */ if (sgs->sum_nr_running) - avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; + sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && + if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task && (max_nr_running - min_nr_running) > 1) sgs->group_imb = 1; @@ -4776,15 +4775,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s struct sg_lb_stats *local, *busiest; local = &sds->local_stat; - if (local->sum_nr_running) { - local->load_per_task = - local->sum_weighted_load / local->sum_nr_running; - } - busiest = &sds->busiest_stat; - /* busiest must have some tasks */ - busiest->load_per_task = - busiest->sum_weighted_load / busiest->sum_nr_running; if (busiest->group_imb) { busiest->load_per_task = -- cgit v1.3-8-gc7d7 From 3ae11c90fd055ba1b1b03a014f851b395bdd26ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Aug 2013 20:37:48 +0200 Subject: sched/fair: Make group power more consistent For easier access, less dereferences and more consistent value, store the group power in update_sg_lb_stats() and use it thereafter. The actual value in sched_group::sched_group_power::power can change throughout the load-balance pass if we're unlucky. Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-739xxqkyvftrhnh9ncudutc7@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 57952198b01e..ccf20e76b6b2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4239,6 +4239,7 @@ struct sg_lb_stats { unsigned long group_load; /* Total load over the CPUs of the group */ unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; + unsigned long group_power; unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity; unsigned int idle_cpus; @@ -4518,7 +4519,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, update_group_power(env->sd, env->dst_cpu); /* Adjust by relative CPU power of the group */ - sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power; + sgs->group_power = group->sgp->power; + sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; /* * Consider the group unbalanced when the imbalance is larger @@ -4537,7 +4539,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_imb = 1; sgs->group_capacity = - DIV_ROUND_CLOSEST(group->sgp->power, SCHED_POWER_SCALE); + DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); if (!sgs->group_capacity) sgs->group_capacity = fix_small_capacity(env->sd, group); @@ -4637,7 +4639,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, /* Now, start updating sd_lb_stats */ sds->total_load += sgs->group_load; - sds->total_pwr += sg->sgp->power; + sds->total_pwr += sgs->group_power; if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) { sds->busiest = sg; @@ -4685,8 +4687,9 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) if (env->dst_cpu > busiest_cpu) return 0; - env->imbalance = DIV_ROUND_CLOSEST(sds->busiest_stat.avg_load * - sds->busiest->sgp->power, SCHED_POWER_SCALE); + env->imbalance = DIV_ROUND_CLOSEST( + sds->busiest_stat.avg_load * sds->busiest_stat.group_power, + SCHED_POWER_SCALE); return 1; } @@ -4716,7 +4719,7 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) scaled_busy_load_per_task = (busiest->load_per_task * SCHED_POWER_SCALE) / - sds->busiest->sgp->power; + busiest->group_power; if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { @@ -4730,32 +4733,32 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) * moving them. */ - pwr_now += sds->busiest->sgp->power * + pwr_now += busiest->group_power * min(busiest->load_per_task, busiest->avg_load); - pwr_now += sds->local->sgp->power * + pwr_now += local->group_power * min(local->load_per_task, local->avg_load); pwr_now /= SCHED_POWER_SCALE; /* Amount of load we'd subtract */ tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - sds->busiest->sgp->power; + busiest->group_power; if (busiest->avg_load > tmp) { - pwr_move += sds->busiest->sgp->power * + pwr_move += busiest->group_power * min(busiest->load_per_task, busiest->avg_load - tmp); } /* Amount of load we'd add */ - if (busiest->avg_load * sds->busiest->sgp->power < + if (busiest->avg_load * busiest->group_power < busiest->load_per_task * SCHED_POWER_SCALE) { - tmp = (busiest->avg_load * sds->busiest->sgp->power) / - sds->local->sgp->power; + tmp = (busiest->avg_load * busiest->group_power) / + local->group_power; } else { tmp = (busiest->load_per_task * SCHED_POWER_SCALE) / - sds->local->sgp->power; + local->group_power; } - pwr_move += sds->local->sgp->power * - min(local->load_per_task, local->avg_load + tmp); + pwr_move += local->group_power * + min(local->load_per_task, local->avg_load + tmp); pwr_move /= SCHED_POWER_SCALE; /* Move if we gain throughput */ @@ -4800,7 +4803,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s (busiest->sum_nr_running - busiest->group_capacity); load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE); - load_above_capacity /= sds->busiest->sgp->power; + load_above_capacity /= busiest->group_power; } /* @@ -4818,8 +4821,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s /* How much load to actually move to equalise the imbalance */ env->imbalance = min( - max_pull * sds->busiest->sgp->power, - (sds->avg_load - local->avg_load) * sds->local->sgp->power + max_pull * busiest->group_power, + (sds->avg_load - local->avg_load) * local->group_power ) / SCHED_POWER_SCALE; /* -- cgit v1.3-8-gc7d7 From 6906a40839198f33dbb56d20e644c01e00663952 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Aug 2013 15:20:21 +0200 Subject: sched/fair: Optimize find_busiest_queue() Use for_each_cpu_and() and thereby avoid computing the capacity for CPUs we know we're not interested in. Reviewed-by: Paul Turner Reviewed-by: Preeti U Murthy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-lppceyv6kb3a19g8spmrn20b@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ccf20e76b6b2..bedd30b168a5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4946,7 +4946,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, unsigned long busiest_load = 0, busiest_power = 1; int i; - for_each_cpu(i, sched_group_cpus(group)) { + for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { unsigned long power = power_of(i); unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); @@ -4955,9 +4955,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (!capacity) capacity = fix_small_capacity(env->sd, group); - if (!cpumask_test_cpu(i, env->cpus)) - continue; - rq = cpu_rq(i); wl = weighted_cpuload(i); -- cgit v1.3-8-gc7d7 From 30ce5dabc92b5a349a7d9e9cf499494d230e0691 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Aug 2013 20:29:29 +0200 Subject: sched/fair: Rework and comment the group_imb code Rik reported some weirdness due to the group_imb code. As a start to looking at it, clean it up a little and add a few explanatory comments. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-caeeqttnla4wrrmhp5uf89gp@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 123 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 89 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bedd30b168a5..dffb27070ddb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4463,6 +4463,81 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) return 0; } +/* + * Group imbalance indicates (and tries to solve) the problem where balancing + * groups is inadequate due to tsk_cpus_allowed() constraints. + * + * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a + * cpumask covering 1 cpu of the first group and 3 cpus of the second group. + * Something like: + * + * { 0 1 2 3 } { 4 5 6 7 } + * * * * * + * + * If we were to balance group-wise we'd place two tasks in the first group and + * two tasks in the second group. Clearly this is undesired as it will overload + * cpu 3 and leave one of the cpus in the second group unused. + * + * The current solution to this issue is detecting the skew in the first group + * by noticing it has a cpu that is overloaded while the remaining cpus are + * idle -- or rather, there's a distinct imbalance in the cpus; see + * sg_imbalanced(). + * + * When this is so detected; this group becomes a candidate for busiest; see + * update_sd_pick_busiest(). And calculcate_imbalance() and + * find_busiest_group() avoid some of the usual balance conditional to allow it + * to create an effective group imbalance. + * + * This is a somewhat tricky proposition since the next run might not find the + * group imbalance and decide the groups need to be balanced again. A most + * subtle and fragile situation. + */ + +struct sg_imb_stats { + unsigned long max_nr_running, min_nr_running; + unsigned long max_cpu_load, min_cpu_load; +}; + +static inline void init_sg_imb_stats(struct sg_imb_stats *sgi) +{ + sgi->max_cpu_load = sgi->max_nr_running = 0UL; + sgi->min_cpu_load = sgi->min_nr_running = ~0UL; +} + +static inline void +update_sg_imb_stats(struct sg_imb_stats *sgi, + unsigned long load, unsigned long nr_running) +{ + if (load > sgi->max_cpu_load) + sgi->max_cpu_load = load; + if (sgi->min_cpu_load > load) + sgi->min_cpu_load = load; + + if (nr_running > sgi->max_nr_running) + sgi->max_nr_running = nr_running; + if (sgi->min_nr_running > nr_running) + sgi->min_nr_running = nr_running; +} + +static inline int +sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) +{ + /* + * Consider the group unbalanced when the imbalance is larger + * than the average weight of a task. + * + * APZ: with cgroup the avg task weight can vary wildly and + * might not be a suitable number - should we keep a + * normalized nr_running number somewhere that negates + * the hierarchy? + */ + if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task && + (sgi->max_nr_running - sgi->min_nr_running) > 1) + return 1; + + return 0; +} + /** * update_sg_lb_stats - Update sched_group's statistics for load balancing. * @env: The load balancing environment. @@ -4475,15 +4550,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, struct sched_group *group, int load_idx, int local_group, struct sg_lb_stats *sgs) { - unsigned long nr_running, max_nr_running, min_nr_running; - unsigned long load, max_cpu_load, min_cpu_load; + struct sg_imb_stats sgi; + unsigned long nr_running; + unsigned long load; int i; - /* Tally up the load of all CPUs in the group */ - max_cpu_load = 0; - min_cpu_load = ~0UL; - max_nr_running = 0; - min_nr_running = ~0UL; + init_sg_imb_stats(&sgi); for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { struct rq *rq = cpu_rq(i); @@ -4495,16 +4567,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = target_load(i, load_idx); } else { load = source_load(i, load_idx); - - if (load > max_cpu_load) - max_cpu_load = load; - if (min_cpu_load > load) - min_cpu_load = load; - - if (nr_running > max_nr_running) - max_nr_running = nr_running; - if (min_nr_running > nr_running) - min_nr_running = nr_running; + update_sg_imb_stats(&sgi, load, nr_running); } sgs->group_load += load; @@ -4522,21 +4585,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_power = group->sgp->power; sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; - /* - * Consider the group unbalanced when the imbalance is larger - * than the average weight of a task. - * - * APZ: with cgroup the avg task weight can vary wildly and - * might not be a suitable number - should we keep a - * normalized nr_running number somewhere that negates - * the hierarchy? - */ if (sgs->sum_nr_running) sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; - if ((max_cpu_load - min_cpu_load) >= sgs->load_per_task && - (max_nr_running - min_nr_running) > 1) - sgs->group_imb = 1; + sgs->group_imb = sg_imbalanced(sgs, &sgi); sgs->group_capacity = DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE); @@ -4781,6 +4833,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s busiest = &sds->busiest_stat; if (busiest->group_imb) { + /* + * In the group_imb case we cannot rely on group-wide averages + * to ensure cpu-load equilibrium, look at wider averages. XXX + */ busiest->load_per_task = min(busiest->load_per_task, sds->avg_load); } @@ -4798,6 +4854,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s if (!busiest->group_imb) { /* * Don't want to pull so many tasks that a group would go idle. + * Except of course for the group_imb case, since then we might + * have to drop below capacity to reach cpu-load equilibrium. */ load_above_capacity = (busiest->sum_nr_running - busiest->group_capacity); @@ -4813,11 +4871,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * we also don't want to reduce the group load below the group capacity * (so that we can implement power-savings policies etc). Thus we look * for the minimum possible imbalance. - * Be careful of negative numbers as they'll appear as very large values - * with unsigned longs. */ - max_pull = min(busiest->avg_load - sds->avg_load, - load_above_capacity); + max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity); /* How much load to actually move to equalise the imbalance */ env->imbalance = min( @@ -4881,7 +4936,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) /* * If the busiest group is imbalanced the below checks don't - * work because they assumes all things are equal, which typically + * work because they assume all things are equal, which typically * isn't true due to cpus_allowed constraints and the like. */ if (busiest->group_imb) -- cgit v1.3-8-gc7d7 From 10866e62e8a6907d9072f10f9a0561db0c0cf50b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 19 Aug 2013 16:57:04 +0200 Subject: sched/fair: Fix the sd_parent_degenerate() code I found that on my WSM box I had a redundant domain: [ 0.949769] CPU0 attaching sched-domain: [ 0.953765] domain 0: span 0,12 level SIBLING [ 0.958335] groups: 0 (cpu_power = 587) 12 (cpu_power = 588) [ 0.964548] domain 1: span 0-5,12-17 level MC [ 0.969206] groups: 0,12 (cpu_power = 1175) 1,13 (cpu_power = 1176) 2,14 (cpu_power = 1176) 3,15 (cpu_power = 1176) 4,16 (cpu_power = 1176) 5,17 (cpu_power = 1176) [ 0.984993] domain 2: span 0-5,12-17 level CPU [ 0.989822] groups: 0-5,12-17 (cpu_power = 7055) [ 0.995049] domain 3: span 0-23 level NUMA [ 0.999620] groups: 0-5,12-17 (cpu_power = 7055) 6-11,18-23 (cpu_power = 7056) Note how domain 2 has only a single group and spans the same CPUs as domain 1. We should not keep such domains and do in fact have code to prune these. It turns out that the 'new' SD_PREFER_SIBLING flag causes this, it makes sd_parent_degenerate() fail on the CPU domain. We can easily fix this by 'ignoring' the SD_PREFER_SIBLING bit and transfering it to whatever domain ends up covering the span. With this patch the domains now look like this: [ 0.950419] CPU0 attaching sched-domain: [ 0.954454] domain 0: span 0,12 level SIBLING [ 0.959039] groups: 0 (cpu_power = 587) 12 (cpu_power = 588) [ 0.965271] domain 1: span 0-5,12-17 level MC [ 0.969936] groups: 0,12 (cpu_power = 1175) 1,13 (cpu_power = 1176) 2,14 (cpu_power = 1176) 3,15 (cpu_power = 1176) 4,16 (cpu_power = 1176) 5,17 (cpu_power = 1176) [ 0.985737] domain 2: span 0-23 level NUMA [ 0.990231] groups: 0-5,12-17 (cpu_power = 7055) 6-11,18-23 (cpu_power = 7056) Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-ys201g4jwukj0h8xcamakxq1@git.kernel.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cf8f100433e0..4da0f4bb2ca4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4914,7 +4914,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUPOWER | - SD_SHARE_PKG_RESOURCES); + SD_SHARE_PKG_RESOURCES | + SD_PREFER_SIBLING); if (nr_node_ids == 1) pflags &= ~SD_SERIALIZE; } @@ -5118,6 +5119,13 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) tmp->parent = parent->parent; if (parent->parent) parent->parent->child = tmp; + /* + * Transfer SD_PREFER_SIBLING down in case of a + * degenerate parent; the spans match for this + * so the property transfers. + */ + if (parent->flags & SD_PREFER_SIBLING) + tmp->flags |= SD_PREFER_SIBLING; destroy_sched_domain(parent, cpu); } else tmp = tmp->parent; -- cgit v1.3-8-gc7d7 From 13d7a2410fa637f450a29ecb515ac318ee40c741 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Wed, 21 Aug 2013 12:10:24 +0200 Subject: perf: Add attr->mmap2 attribute to an event Adds a new PERF_RECORD_MMAP2 record type which is essence an expanded version of PERF_RECORD_MMAP. Used to request mmap records with more information about the mapping, including device major, minor and the inode number and generation for mappings associated with files or shared memory segments. Works for code and data (with attr->mmap_data set). Existing PERF_RECORD_MMAP record is unmodified by this patch. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Al Viro Link: http://lkml.kernel.org/r/1377079825-19057-2-git-send-email-eranian@google.com [ Added Al to the Cc:. Are the ino, maj/min exports of vma->vm_file OK? ] Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 24 ++++++++++++++++++++- kernel/events/core.c | 46 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 65 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 42cb7b62ca59..a77f43af72b8 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -276,8 +276,9 @@ struct perf_event_attr { exclude_callchain_kernel : 1, /* exclude kernel callchains */ exclude_callchain_user : 1, /* exclude user callchains */ + mmap2 : 1, /* include mmap with inode data */ - __reserved_1 : 41; + __reserved_1 : 40; union { __u32 wakeup_events; /* wakeup every n events */ @@ -651,6 +652,27 @@ enum perf_event_type { */ PERF_RECORD_SAMPLE = 9, + /* + * The MMAP2 records are an augmented version of MMAP, they add + * maj, min, ino numbers to be used to uniquely identify each mapping + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP2 = 10, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 15d0f2418e54..c7ee497c39a7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4776,7 +4776,7 @@ next: /* * task tracking -- fork/exit * - * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task + * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task */ struct perf_task_event { @@ -4796,8 +4796,9 @@ struct perf_task_event { static int perf_event_task_match(struct perf_event *event) { - return event->attr.comm || event->attr.mmap || - event->attr.mmap_data || event->attr.task; + return event->attr.comm || event->attr.mmap || + event->attr.mmap2 || event->attr.mmap_data || + event->attr.task; } static void perf_event_task_output(struct perf_event *event, @@ -4992,6 +4993,9 @@ struct perf_mmap_event { const char *file_name; int file_size; + int maj, min; + u64 ino; + u64 ino_generation; struct { struct perf_event_header header; @@ -5012,7 +5016,7 @@ static int perf_event_mmap_match(struct perf_event *event, int executable = vma->vm_flags & VM_EXEC; return (!executable && event->attr.mmap_data) || - (executable && event->attr.mmap); + (executable && (event->attr.mmap || event->attr.mmap2)); } static void perf_event_mmap_output(struct perf_event *event, @@ -5027,6 +5031,13 @@ static void perf_event_mmap_output(struct perf_event *event, if (!perf_event_mmap_match(event, data)) return; + if (event->attr.mmap2) { + mmap_event->event_id.header.type = PERF_RECORD_MMAP2; + mmap_event->event_id.header.size += sizeof(mmap_event->maj); + mmap_event->event_id.header.size += sizeof(mmap_event->min); + mmap_event->event_id.header.size += sizeof(mmap_event->ino); + } + perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); ret = perf_output_begin(&handle, event, mmap_event->event_id.header.size); @@ -5037,6 +5048,14 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.tid = perf_event_tid(event, current); perf_output_put(&handle, mmap_event->event_id); + + if (event->attr.mmap2) { + perf_output_put(&handle, mmap_event->maj); + perf_output_put(&handle, mmap_event->min); + perf_output_put(&handle, mmap_event->ino); + perf_output_put(&handle, mmap_event->ino_generation); + } + __output_copy(&handle, mmap_event->file_name, mmap_event->file_size); @@ -5051,6 +5070,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) { struct vm_area_struct *vma = mmap_event->vma; struct file *file = vma->vm_file; + int maj = 0, min = 0; + u64 ino = 0, gen = 0; unsigned int size; char tmp[16]; char *buf = NULL; @@ -5059,6 +5080,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) memset(tmp, 0, sizeof(tmp)); if (file) { + struct inode *inode; + dev_t dev; /* * d_path works from the end of the rb backwards, so we * need to add enough zero bytes after the string to handle @@ -5074,6 +5097,13 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) name = strncpy(tmp, "//toolong", sizeof(tmp)); goto got_name; } + inode = file_inode(vma->vm_file); + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + gen = inode->i_generation; + maj = MAJOR(dev); + min = MINOR(dev); + } else { if (arch_vma_name(mmap_event->vma)) { name = strncpy(tmp, arch_vma_name(mmap_event->vma), @@ -5104,6 +5134,10 @@ got_name: mmap_event->file_name = name; mmap_event->file_size = size; + mmap_event->maj = maj; + mmap_event->min = min; + mmap_event->ino = ino; + mmap_event->ino_generation = gen; if (!(vma->vm_flags & VM_EXEC)) mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA; @@ -5140,6 +5174,10 @@ void perf_event_mmap(struct vm_area_struct *vma) .len = vma->vm_end - vma->vm_start, .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT, }, + /* .maj (attr_mmap2 only) */ + /* .min (attr_mmap2 only) */ + /* .ino (attr_mmap2 only) */ + /* .ino_generation (attr_mmap2 only) */ }; perf_event_mmap_event(&mmap_event); -- cgit v1.3-8-gc7d7 From 942e443127e928a5631c3d5102aca8c8b3c2dd98 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Tue, 3 Sep 2013 16:33:57 +0930 Subject: module: Fix mod->mkobj.kobj potentially freed too early DEBUG_KOBJECT_RELEASE helps to find the issue attached below. After some investigation, it seems the reason is: The mod->mkobj.kobj(ffffffffa01600d0 below) is freed together with mod itself in free_module(). However, its children still hold references to it, as the delay caused by DEBUG_KOBJECT_RELEASE. So when the child(holders below) tries to decrease the reference count to its parent in kobject_del(), BUG happens as it tries to access already freed memory. This patch tries to fix it by waiting for the mod->mkobj.kobj to be really released in the module removing process (and some error code paths). [ 1844.175287] kobject: 'holders' (ffff88007c1f1600): kobject_release, parent ffffffffa01600d0 (delayed) [ 1844.178991] kobject: 'notes' (ffff8800370b2a00): kobject_release, parent ffffffffa01600d0 (delayed) [ 1845.180118] kobject: 'holders' (ffff88007c1f1600): kobject_cleanup, parent ffffffffa01600d0 [ 1845.182130] kobject: 'holders' (ffff88007c1f1600): auto cleanup kobject_del [ 1845.184120] BUG: unable to handle kernel paging request at ffffffffa01601d0 [ 1845.185026] IP: [] kobject_put+0x11/0x60 [ 1845.185026] PGD 1a13067 PUD 1a14063 PMD 7bd30067 PTE 0 [ 1845.185026] Oops: 0000 [#1] PREEMPT [ 1845.185026] Modules linked in: xfs libcrc32c [last unloaded: kprobe_example] [ 1845.185026] CPU: 0 PID: 18 Comm: kworker/0:1 Tainted: G O 3.11.0-rc6-next-20130819+ #1 [ 1845.185026] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 1845.185026] Workqueue: events kobject_delayed_cleanup [ 1845.185026] task: ffff88007ca51f00 ti: ffff88007ca5c000 task.ti: ffff88007ca5c000 [ 1845.185026] RIP: 0010:[] [] kobject_put+0x11/0x60 [ 1845.185026] RSP: 0018:ffff88007ca5dd08 EFLAGS: 00010282 [ 1845.185026] RAX: 0000000000002000 RBX: ffffffffa01600d0 RCX: ffffffff8177d638 [ 1845.185026] RDX: ffff88007ca5dc18 RSI: 0000000000000000 RDI: ffffffffa01600d0 [ 1845.185026] RBP: ffff88007ca5dd18 R08: ffffffff824e9810 R09: ffffffffffffffff [ 1845.185026] R10: ffff8800ffffffff R11: dead4ead00000001 R12: ffffffff81a95040 [ 1845.185026] R13: ffff88007b27a960 R14: ffff88007c1f1600 R15: 0000000000000000 [ 1845.185026] FS: 0000000000000000(0000) GS:ffffffff81a23000(0000) knlGS:0000000000000000 [ 1845.185026] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 1845.185026] CR2: ffffffffa01601d0 CR3: 0000000037207000 CR4: 00000000000006b0 [ 1845.185026] Stack: [ 1845.185026] ffff88007c1f1600 ffff88007c1f1600 ffff88007ca5dd38 ffffffff812cdb7e [ 1845.185026] 0000000000000000 ffff88007c1f1640 ffff88007ca5dd68 ffffffff812cdbfe [ 1845.185026] ffff88007c974800 ffff88007c1f1640 ffff88007ff61a00 0000000000000000 [ 1845.185026] Call Trace: [ 1845.185026] [] kobject_del+0x2e/0x40 [ 1845.185026] [] kobject_delayed_cleanup+0x6e/0x1d0 [ 1845.185026] [] process_one_work+0x1e5/0x670 [ 1845.185026] [] ? process_one_work+0x183/0x670 [ 1845.185026] [] worker_thread+0x113/0x370 [ 1845.185026] [] ? rescuer_thread+0x290/0x290 [ 1845.185026] [] kthread+0xda/0xe0 [ 1845.185026] [] ? _raw_spin_unlock_irq+0x30/0x60 [ 1845.185026] [] ? kthread_create_on_node+0x130/0x130 [ 1845.185026] [] ret_from_fork+0x7a/0xb0 [ 1845.185026] [] ? kthread_create_on_node+0x130/0x130 [ 1845.185026] Code: 81 48 c7 c7 28 95 ad 81 31 c0 e8 9b da 01 00 e9 4f ff ff ff 66 0f 1f 44 00 00 55 48 89 e5 53 48 89 fb 48 83 ec 08 48 85 ff 74 1d 87 00 01 00 00 01 74 1e 48 8d 7b 38 83 6b 38 01 0f 94 c0 84 [ 1845.185026] RIP [] kobject_put+0x11/0x60 [ 1845.185026] RSP [ 1845.185026] CR2: ffffffffa01601d0 [ 1845.185026] ---[ end trace 49a70afd109f5653 ]--- Signed-off-by: Li Zhong Acked-by: Greg Kroah-Hartman Signed-off-by: Rusty Russell --- include/linux/module.h | 1 + kernel/module.c | 14 +++++++++++--- kernel/params.c | 7 +++++++ 3 files changed, 19 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 504035f3ece1..05f2447f8c15 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -42,6 +42,7 @@ struct module_kobject { struct module *mod; struct kobject *drivers_dir; struct module_param_attrs *mp; + struct completion *kobj_completion; }; struct module_attribute { diff --git a/kernel/module.c b/kernel/module.c index 40ee1dc3c3bf..9f5ddae72f44 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1612,6 +1612,14 @@ static void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } +static void mod_kobject_put(struct module *mod) +{ + DECLARE_COMPLETION_ONSTACK(c); + mod->mkobj.kobj_completion = &c; + kobject_put(&mod->mkobj.kobj); + wait_for_completion(&c); +} + static int mod_sysfs_init(struct module *mod) { int err; @@ -1639,7 +1647,7 @@ static int mod_sysfs_init(struct module *mod) err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, "%s", mod->name); if (err) - kobject_put(&mod->mkobj.kobj); + mod_kobject_put(mod); /* delay uevent until full sysfs population */ out: @@ -1683,7 +1691,7 @@ out_unreg_param: out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: - kobject_put(&mod->mkobj.kobj); + mod_kobject_put(mod); out: return err; } @@ -1692,7 +1700,7 @@ static void mod_sysfs_fini(struct module *mod) { remove_notes_attrs(mod); remove_sect_attrs(mod); - kobject_put(&mod->mkobj.kobj); + mod_kobject_put(mod); } #else /* !CONFIG_SYSFS */ diff --git a/kernel/params.c b/kernel/params.c index e5f8f17e57cf..501bde4f3bee 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -915,7 +915,14 @@ static const struct kset_uevent_ops module_uevent_ops = { struct kset *module_kset; int module_sysfs_initialized; +static void module_kobj_release(struct kobject *kobj) +{ + struct module_kobject *mk = to_module_kobject(kobj); + complete(mk->kobj_completion); +} + struct kobj_type module_ktype = { + .release = module_kobj_release, .sysfs_ops = &module_sysfs_ops, }; -- cgit v1.3-8-gc7d7 From 59338f754a55f07857342dbcd81652a4f091d72f Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Sat, 31 Aug 2013 01:04:07 -0400 Subject: ftrace: Fix a slight race in modifying what function callback gets traced There's a slight race when going from a list function to a non list function. That is, when only one callback is registered to the function tracer, it gets called directly by the mcount trampoline. But if this function has filters, it may be called by the wrong functions. As the list ops callback that handles multiple callbacks that are registered to ftrace, it also handles what functions they call. While the transaction is taking place, use the list function always, and after all the updates are finished (only the functions that should be traced are being traced), then we can update the trampoline to call the function directly. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a6d098c6df3f..03cf44ac54d3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1978,12 +1978,27 @@ int __weak ftrace_arch_code_modify_post_process(void) void ftrace_modify_all_code(int command) { + int update = command & FTRACE_UPDATE_TRACE_FUNC; + + /* + * If the ftrace_caller calls a ftrace_ops func directly, + * we need to make sure that it only traces functions it + * expects to trace. When doing the switch of functions, + * we need to update to the ftrace_ops_list_func first + * before the transition between old and new calls are set, + * as the ftrace_ops_list_func will check the ops hashes + * to make sure the ops are having the right functions + * traced. + */ + if (update) + ftrace_update_ftrace_func(ftrace_ops_list_func); + if (command & FTRACE_UPDATE_CALLS) ftrace_replace_code(1); else if (command & FTRACE_DISABLE_CALLS) ftrace_replace_code(0); - if (command & FTRACE_UPDATE_TRACE_FUNC) + if (update && ftrace_trace_function != ftrace_ops_list_func) ftrace_update_ftrace_func(ftrace_trace_function); if (command & FTRACE_START_FUNC_RET) -- cgit v1.3-8-gc7d7 From a2e0578be3652406b2ffd2eeb31cdbdffa325d64 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Aug 2013 12:41:41 -0400 Subject: switch copy_module_from_fd() to fdget Signed-off-by: Al Viro --- kernel/module.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 206915830d29..c6756d1c6d73 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2540,21 +2540,20 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, /* Sets info->hdr and info->len. */ static int copy_module_from_fd(int fd, struct load_info *info) { - struct file *file; + struct fd f = fdget(fd); int err; struct kstat stat; loff_t pos; ssize_t bytes = 0; - file = fget(fd); - if (!file) + if (!f.file) return -ENOEXEC; - err = security_kernel_module_from_file(file); + err = security_kernel_module_from_file(f.file); if (err) goto out; - err = vfs_getattr(&file->f_path, &stat); + err = vfs_getattr(&f.file->f_path, &stat); if (err) goto out; @@ -2577,7 +2576,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) pos = 0; while (pos < stat.size) { - bytes = kernel_read(file, pos, (char *)(info->hdr) + pos, + bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, stat.size - pos); if (bytes < 0) { vfree(info->hdr); @@ -2591,7 +2590,7 @@ static int copy_module_from_fd(int fd, struct load_info *info) info->len = pos; out: - fput(file); + fdput(f); return err; } -- cgit v1.3-8-gc7d7 From 5a8e01f8fa51f5cbce8f37acc050eb2319d12956 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 4 Sep 2013 15:16:03 +0200 Subject: sched/cputime: Do not scale when utime == 0 scale_stime() silently assumes that stime < rtime, otherwise when stime == rtime and both values are big enough (operations on them do not fit in 32 bits), the resulting scaling stime can be bigger than rtime. In consequence utime = rtime - stime results in negative value. User space visible symptoms of the bug are overflowed TIME values on ps/top, for example: $ ps aux | grep rcu root 8 0.0 0.0 0 0 ? S 12:42 0:00 [rcuc/0] root 9 0.0 0.0 0 0 ? S 12:42 0:00 [rcub/0] root 10 62422329 0.0 0 0 ? R 12:42 21114581:37 [rcu_preempt] root 11 0.1 0.0 0 0 ? S 12:42 0:02 [rcuop/0] root 12 62422329 0.0 0 0 ? S 12:42 21114581:35 [rcuop/1] root 10 62422329 0.0 0 0 ? R 12:42 21114581:37 [rcu_preempt] or overflowed utime values read directly from /proc/$PID/stat Reference: https://lkml.org/lkml/2013/8/20/259 Reported-and-tested-by: Sergey Senozhatsky Signed-off-by: Stanislaw Gruszka Cc: stable@vger.kernel.org Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul E. McKenney Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20130904131602.GC2564@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index c1d7493825ae..5b03f5bebabc 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -551,10 +551,7 @@ static void cputime_adjust(struct task_cputime *curr, struct cputime *prev, cputime_t *ut, cputime_t *st) { - cputime_t rtime, stime, utime, total; - - stime = curr->stime; - total = stime + curr->utime; + cputime_t rtime, stime, utime; /* * Tick based cputime accounting depend on random scheduling @@ -576,13 +573,19 @@ static void cputime_adjust(struct task_cputime *curr, if (prev->stime + prev->utime >= rtime) goto out; - if (total) { + stime = curr->stime; + utime = curr->utime; + + if (utime == 0) { + stime = rtime; + } else if (stime == 0) { + utime = rtime; + } else { + cputime_t total = stime + utime; + stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)total); utime = rtime - stime; - } else { - stime = rtime; - utime = 0; } /* -- cgit v1.3-8-gc7d7 From a0a5a0561f63905fe94c49bc567615829f42ce1e Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Sat, 31 Aug 2013 01:04:07 -0400 Subject: ftrace/rcu: Do not trace debug_lockdep_rcu_enabled() The function debug_lockdep_rcu_enabled() is part of the RCU lockdep debugging, and is called very frequently. I found that if I enable a lot of debugging and run the function graph tracer, this function can cause a live lock of the system. We don't usually trace lockdep infrastructure, no need to trace this either. Reviewed-by: Paul E. McKenney Signed-off-by: Steven Rostedt --- kernel/rcupdate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index cce6ba8bbace..4f20c6c6a848 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -122,7 +122,7 @@ struct lockdep_map rcu_sched_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); EXPORT_SYMBOL_GPL(rcu_sched_lock_map); -int debug_lockdep_rcu_enabled(void) +int notrace debug_lockdep_rcu_enabled(void) { return rcu_scheduler_active && debug_locks && current->lockdep_recursion == 0; -- cgit v1.3-8-gc7d7 From 4e10f3c98888ee88ea2543aa636db6410fa47477 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 30 Aug 2013 12:29:49 -0400 Subject: Kill indirect include of file.h from eventfd.h, use fdget() in cgroup.c kernel/cgroup.c is the only place in the tree that relies on eventfd.h pulling file.h; move that include there. Switch from eventfd_fget()/fput() to fdget()/fdput(), while we are at it - eventfd_ctx_fileget() will fail on non-eventfd descriptors just fine, no need to do that check twice... Signed-off-by: Al Viro --- include/linux/eventfd.h | 3 ++- kernel/cgroup.c | 33 +++++++++++++++++---------------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index cf5d2af61b81..ff0b981f078e 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -9,7 +9,6 @@ #define _LINUX_EVENTFD_H #include -#include #include /* @@ -26,6 +25,8 @@ #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE) +struct file; + #ifdef CONFIG_EVENTFD struct file *eventfd_file_create(unsigned int count, int flags); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0aeb32415ff..2418b6e71a85 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -60,6 +60,7 @@ #include #include /* used in cgroup_attach_task */ #include +#include #include @@ -4034,8 +4035,8 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, struct cgroup_event *event; struct cgroup_subsys_state *cfile_css; unsigned int efd, cfd; - struct file *efile; - struct file *cfile; + struct fd efile; + struct fd cfile; char *endp; int ret; @@ -4058,31 +4059,31 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, init_waitqueue_func_entry(&event->wait, cgroup_event_wake); INIT_WORK(&event->remove, cgroup_event_remove); - efile = eventfd_fget(efd); - if (IS_ERR(efile)) { - ret = PTR_ERR(efile); + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; goto out_kfree; } - event->eventfd = eventfd_ctx_fileget(efile); + event->eventfd = eventfd_ctx_fileget(efile.file); if (IS_ERR(event->eventfd)) { ret = PTR_ERR(event->eventfd); goto out_put_efile; } - cfile = fget(cfd); - if (!cfile) { + cfile = fdget(cfd); + if (!cfile.file) { ret = -EBADF; goto out_put_eventfd; } /* the process need read permission on control file */ /* AV: shouldn't we check that it's been opened for read instead? */ - ret = inode_permission(file_inode(cfile), MAY_READ); + ret = inode_permission(file_inode(cfile.file), MAY_READ); if (ret < 0) goto out_put_cfile; - event->cft = __file_cft(cfile); + event->cft = __file_cft(cfile.file); if (IS_ERR(event->cft)) { ret = PTR_ERR(event->cft); goto out_put_cfile; @@ -4103,7 +4104,7 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, ret = -EINVAL; event->css = cgroup_css(cgrp, event->cft->ss); - cfile_css = css_from_dir(cfile->f_dentry->d_parent, event->cft->ss); + cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss); if (event->css && event->css == cfile_css && css_tryget(event->css)) ret = 0; @@ -4121,25 +4122,25 @@ static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css, if (ret) goto out_put_css; - efile->f_op->poll(efile, &event->pt); + efile.file->f_op->poll(efile.file, &event->pt); spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); - fput(cfile); - fput(efile); + fdput(cfile); + fdput(efile); return 0; out_put_css: css_put(event->css); out_put_cfile: - fput(cfile); + fdput(cfile); out_put_eventfd: eventfd_ctx_put(event->eventfd); out_put_efile: - fput(efile); + fdput(efile); out_kfree: kfree(event); -- cgit v1.3-8-gc7d7 From b0cff9d88ce2f3030f73138078c5b1019f17e1cc Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 10 Sep 2013 15:54:49 +0900 Subject: sched: Fix load balancing performance regression in should_we_balance() Commit 23f0d20 ("sched: Factor out code to should_we_balance()") introduces the should_we_balance() function. This function should return 1 if this cpu is appropriate for balancing. But the newly introduced code doesn't do so, it returns 0 instead of 1. This introduces performance regression, reported by Dave Chinner: v4 filesystem v5 filesystem 3.11+xfsdev: 220k files/s 225k files/s 3.12-git 180k files/s 185k files/s 3.12-git-revert 245k files/s 247k files/s You can find more detailed information at: https://lkml.org/lkml/2013/9/10/1 This patch corrects the return value of should_we_balance() function as orignally intended. With this patch, Dave Chinner reports that the regression is gone: v4 filesystem v5 filesystem 3.11+xfsdev: 220k files/s 225k files/s 3.12-git 180k files/s 185k files/s 3.12-git-revert 245k files/s 247k files/s 3.12-git-fix 249k files/s 248k files/s Reported-by: Dave Chinner Tested-by: Dave Chinner Signed-off-by: Joonsoo Kim Cc: Paul Turner Cc: Peter Zijlstra Cc: Dave Chinner Link: http://lkml.kernel.org/r/20130910065448.GA20368@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7f0a5e6cdae0..9b3fe1cd8f40 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5151,7 +5151,7 @@ static int should_we_balance(struct lb_env *env) * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above domains. */ - return balance_cpu != env->dst_cpu; + return balance_cpu == env->dst_cpu; } /* -- cgit v1.3-8-gc7d7 From 58b79a91f57efec9457de8ff93a4cc4fb8daf753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Sep 2013 15:31:08 -0400 Subject: cgroup: fix cgroup post-order descendant walk of empty subtree bd8815a6d8 ("cgroup: make css_for_each_descendant() and friends include the origin css in the iteration") updated cgroup descendant iterators to include the origin css; unfortuantely, it forgot to drop special case handling in css_next_descendant_post() for empty subtree leading to failure to visit the origin css without any child. Fix it by dropping the special case handling and always returning the leftmost descendant on the first iteration. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0aeb32415ff..8075b72d22be 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3187,11 +3187,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, visit the leftmost descendant */ - if (!pos) { - next = css_leftmost_descendant(root); - return next != root ? next : NULL; - } + /* if first iteration, visit leftmost descendant which may be @root */ + if (!pos) + return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) -- cgit v1.3-8-gc7d7 From 3942c07ccf98e66b8893f396dca98f5b076f905f Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Wed, 28 Aug 2013 10:17:53 +1000 Subject: fs: bump inode and dentry counters to long MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This series reworks our current object cache shrinking infrastructure in two main ways: * Noticing that a lot of users copy and paste their own version of LRU lists for objects, we put some effort in providing a generic version. It is modeled after the filesystem users: dentries, inodes, and xfs (for various tasks), but we expect that other users could benefit in the near future with little or no modification. Let us know if you have any issues. * The underlying list_lru being proposed automatically and transparently keeps the elements in per-node lists, and is able to manipulate the node lists individually. Given this infrastructure, we are able to modify the up-to-now hammer called shrink_slab to proceed with node-reclaim instead of always searching memory from all over like it has been doing. Per-node lru lists are also expected to lead to less contention in the lru locks on multi-node scans, since we are now no longer fighting for a global lock. The locks usually disappear from the profilers with this change. Although we have no official benchmarks for this version - be our guest to independently evaluate this - earlier versions of this series were performance tested (details at http://permalink.gmane.org/gmane.linux.kernel.mm/100537) yielding no visible performance regressions while yielding a better qualitative behavior in NUMA machines. With this infrastructure in place, we can use the list_lru entry point to provide memcg isolation and per-memcg targeted reclaim. Historically, those two pieces of work have been posted together. This version presents only the infrastructure work, deferring the memcg work for a later time, so we can focus on getting this part tested. You can see more about the history of such work at http://lwn.net/Articles/552769/ Dave Chinner (18): dcache: convert dentry_stat.nr_unused to per-cpu counters dentry: move to per-sb LRU locks dcache: remove dentries from LRU before putting on dispose list mm: new shrinker API shrinker: convert superblock shrinkers to new API list: add a new LRU list type inode: convert inode lru list to generic lru list code. dcache: convert to use new lru list infrastructure list_lru: per-node list infrastructure shrinker: add node awareness fs: convert inode and dentry shrinking to be node aware xfs: convert buftarg LRU to generic code xfs: rework buffer dispose list tracking xfs: convert dquot cache lru to list_lru fs: convert fs shrinkers to new scan/count API drivers: convert shrinkers to new count/scan API shrinker: convert remaining shrinkers to count/scan API shrinker: Kill old ->shrink API. Glauber Costa (7): fs: bump inode and dentry counters to long super: fix calculation of shrinkable objects for small numbers list_lru: per-node API vmscan: per-node deferred work i915: bail out earlier when shrinker cannot acquire mutex hugepage: convert huge zero page shrinker to new shrinker API list_lru: dynamically adjust node arrays This patch: There are situations in very large machines in which we can have a large quantity of dirty inodes, unused dentries, etc. This is particularly true when umounting a filesystem, where eventually since every live object will eventually be discarded. Dave Chinner reported a problem with this while experimenting with the shrinker revamp patchset. So we believe it is time for a change. This patch just moves int to longs. Machines where it matters should have a big long anyway. Signed-off-by: Glauber Costa Cc: Dave Chinner Cc: "Theodore Ts'o" Cc: Adrian Hunter Cc: Al Viro Cc: Artem Bityutskiy Cc: Arve Hjønnevåg Cc: Carlos Maiolino Cc: Christoph Hellwig Cc: Chuck Lever Cc: Daniel Vetter Cc: Dave Chinner Cc: David Rientjes Cc: Gleb Natapov Cc: Greg Thelen Cc: J. Bruce Fields Cc: Jan Kara Cc: Jerome Glisse Cc: John Stultz Cc: KAMEZAWA Hiroyuki Cc: Kent Overstreet Cc: Kirill A. Shutemov Cc: Marcelo Tosatti Cc: Mel Gorman Cc: Steven Whitehouse Cc: Thomas Hellstrom Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++---- fs/inode.c | 18 +++++++++--------- fs/internal.h | 2 +- include/linux/dcache.h | 10 +++++----- include/linux/fs.h | 4 ++-- include/uapi/linux/fs.h | 6 +++--- kernel/sysctl.c | 6 +++--- 7 files changed, 27 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/fs/dcache.c b/fs/dcache.c index 4d9df3c940e6..6ef1c2e1bbc4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -146,13 +146,13 @@ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; -static DEFINE_PER_CPU(unsigned int, nr_dentry); +static DEFINE_PER_CPU(long, nr_dentry); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -static int get_nr_dentry(void) +static long get_nr_dentry(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; @@ -162,7 +162,7 @@ int proc_nr_dentry(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif diff --git a/fs/inode.c b/fs/inode.c index 93a0625b46e4..2a3c37ea823d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -70,33 +70,33 @@ EXPORT_SYMBOL(empty_aops); */ struct inodes_stat_t inodes_stat; -static DEFINE_PER_CPU(unsigned int, nr_inodes); -static DEFINE_PER_CPU(unsigned int, nr_unused); +static DEFINE_PER_CPU(unsigned long, nr_inodes); +static DEFINE_PER_CPU(unsigned long, nr_unused); static struct kmem_cache *inode_cachep __read_mostly; -static int get_nr_inodes(void) +static long get_nr_inodes(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_inodes, i); return sum < 0 ? 0 : sum; } -static inline int get_nr_inodes_unused(void) +static inline long get_nr_inodes_unused(void) { int i; - int sum = 0; + long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_unused, i); return sum < 0 ? 0 : sum; } -int get_nr_dirty_inodes(void) +long get_nr_dirty_inodes(void) { /* not actually dirty inodes, but a wild approximation */ - int nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); + long nr_dirty = get_nr_inodes() - get_nr_inodes_unused(); return nr_dirty > 0 ? nr_dirty : 0; } @@ -109,7 +109,7 @@ int proc_nr_inodes(ctl_table *table, int write, { inodes_stat.nr_inodes = get_nr_inodes(); inodes_stat.nr_unused = get_nr_inodes_unused(); - return proc_dointvec(table, write, buffer, lenp, ppos); + return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif diff --git a/fs/internal.h b/fs/internal.h index 2be46ea5dd0b..b6495659d6e8 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -121,7 +121,7 @@ extern void inode_add_lru(struct inode *inode); */ extern void inode_wb_list_del(struct inode *inode); -extern int get_nr_dirty_inodes(void); +extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); extern int invalidate_inodes(struct super_block *, bool); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index feaa8d88eef7..844a1ef387e4 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -55,11 +55,11 @@ struct qstr { #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) struct dentry_stat_t { - int nr_dentry; - int nr_unused; - int age_limit; /* age in seconds */ - int want_pages; /* pages requested by system */ - int dummy[2]; + long nr_dentry; + long nr_unused; + long age_limit; /* age in seconds */ + long want_pages; /* pages requested by system */ + long dummy[2]; }; extern struct dentry_stat_t dentry_stat; diff --git a/include/linux/fs.h b/include/linux/fs.h index 49e71b0f0e9f..3b3edac75df2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1271,12 +1271,12 @@ struct super_block { struct list_head s_mounts; /* list of mounts; _not_ for fs use */ /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ struct list_head s_dentry_lru; /* unused dentry lru */ - int s_nr_dentry_unused; /* # of dentry on lru */ + long s_nr_dentry_unused; /* # of dentry on lru */ /* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */ spinlock_t s_inode_lru_lock ____cacheline_aligned_in_smp; struct list_head s_inode_lru; /* unused inode lru */ - int s_nr_inodes_unused; /* # of inodes on lru */ + long s_nr_inodes_unused; /* # of inodes on lru */ struct block_device *s_bdev; struct backing_dev_info *s_bdi; diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index a4ed56cf0eac..6c28b61bb690 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -49,9 +49,9 @@ struct files_stat_struct { }; struct inodes_stat_t { - int nr_inodes; - int nr_unused; - int dummy[5]; /* padding for sysctl ABI compatibility */ + long nr_inodes; + long nr_unused; + long dummy[5]; /* padding for sysctl ABI compatibility */ }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07f6fc468e17..7822cd88a95c 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,14 +1471,14 @@ static struct ctl_table fs_table[] = { { .procname = "inode-nr", .data = &inodes_stat, - .maxlen = 2*sizeof(int), + .maxlen = 2*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, { .procname = "inode-state", .data = &inodes_stat, - .maxlen = 7*sizeof(int), + .maxlen = 7*sizeof(long), .mode = 0444, .proc_handler = proc_nr_inodes, }, @@ -1508,7 +1508,7 @@ static struct ctl_table fs_table[] = { { .procname = "dentry-state", .data = &dentry_stat, - .maxlen = 6*sizeof(int), + .maxlen = 6*sizeof(long), .mode = 0444, .proc_handler = proc_nr_dentry, }, -- cgit v1.3-8-gc7d7 From d008d5258e9c1a1b7ee6547b8d444323aef331b3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 10 Sep 2013 10:24:05 -0300 Subject: perf: Fix up MMAP2 buffer space reservation The ino_generation field was added in the PERF_RECORD_MMAP2 record in the 13d7a24 cset but no space for it was allocated, corrupting the PERF_FORMAT_{TIME,CPU,TID,etc} area (sample_type/sample_id_all), fix it. Detected with one of the regression tests done by 'perf test': [root@sandy ~]# perf test -v 7 7: Validate PERF_RECORD_* events & perf_sample fields : --- start --- 61315294449606 0 PERF_RECORD_SAMPLE 61315294453161 0 PERF_RECORD_SAMPLE 61315294454441 0 PERF_RECORD_SAMPLE 61315294455709 0 PERF_RECORD_SAMPLE 61315295600899 0 PERF_RECORD_COMM: sleep:6500 27917287430500 342521613 PERF_RECORD_MMAP2 6500/6500: [0x400000(0x7000) @ 0 00:1d 311442 9016]: /usr/bin/sleep MMAP2 going backwards in time, prev=61315295600899, curr=27917287430500 MMAP2 with unexpected cpu, expected 0, got 342521613 MMAP2 with unexpected pid, expected 6500, got 1701606191 MMAP2 with unexpected tid, expected 6500, got 28773 27917287430500 342561333 PERF_RECORD_MMAP2 6500/6500: [0x3b7e000000(0x223000) @ 0 00:1d 309186 9016]: /usr/lib64/ld-2.16.so MMAP2 with unexpected cpu, expected 0, got 342561333 MMAP2 with unexpected pid, expected 6500, got 1932408369 MMAP2 with unexpected tid, expected 6500, got 111 27917287430500 342600095 PERF_RECORD_MMAP2 6500/6500: [0x7fffbd7dc000(0x1000) @ 0x7fffbd7dc000 00:00 0 0]: [vdso] MMAP2 with unexpected cpu, expected 0, got 342600095 MMAP2 with unexpected pid, expected 6500, got 1935963739 MMAP2 with unexpected tid, expected 6500, got 23919 27917287430500 342882834 PERF_RECORD_MMAP2 6500/6500: [0x3b7e400000(0x3b8000) @ 0 00:1d 309187 9016]: /usr/lib64/libc-2.16.so MMAP2 with unexpected cpu, expected 0, got 342882834 MMAP2 with unexpected pid, expected 6500, got 909192754 MMAP2 with unexpected tid, expected 6500, got 7303982 61316297195411 0 PERF_RECORD_EXIT(6500:6500):(6500:6500) ---- end ---- Validate PERF_RECORD_* events & perf_sample fields: FAILED! [root@sandy ~]# After this patch: [root@sandy ~]# perf test 7 7: Validate PERF_RECORD_* events & perf_sample fields : Ok [root@sandy ~]# Acked-by: Peter Zijlstra Acked-by: Stephane Eranian Cc: Adrian Hunter Cc: David Ahern Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-heeuv986b8ha7whqg4o3he7c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 2207efc941d1..dd236b66ca3a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5039,6 +5039,7 @@ static void perf_event_mmap_output(struct perf_event *event, mmap_event->event_id.header.size += sizeof(mmap_event->maj); mmap_event->event_id.header.size += sizeof(mmap_event->min); mmap_event->event_id.header.size += sizeof(mmap_event->ino); + mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation); } perf_event_header__init_id(&mmap_event->event_id.header, &sample, event); -- cgit v1.3-8-gc7d7 From e79f525e99b04390ca4d2366309545a836c03bf1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:19:38 -0700 Subject: pidns: fix vfork() after unshare(CLONE_NEWPID) Commit 8382fcac1b81 ("pidns: Outlaw thread creation after unshare(CLONE_NEWPID)") nacks CLONE_VM if the forking process unshared pid_ns, this obviously breaks vfork: int main(void) { assert(unshare(CLONE_NEWUSER | CLONE_NEWPID) == 0); assert(vfork() >= 0); _exit(0); return 0; } fails without this patch. Change this check to use CLONE_SIGHAND instead. This also forbids CLONE_THREAD automatically, and this is what the comment implies. We could probably even drop CLONE_SIGHAND and use CLONE_THREAD, but it would be safer to not do this. The current check denies CLONE_SIGHAND implicitely and there is no reason to change this. Eric said "CLONE_SIGHAND is fine. CLONE_THREAD would be even better. Having shared signal handling between two different pid namespaces is the case that we are fundamentally guarding against." Signed-off-by: Oleg Nesterov Reported-by: Colin Walters Acked-by: Andy Lutomirski Reviewed-by: "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index c9eaf2013002..3561391ca450 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1173,10 +1173,11 @@ static struct task_struct *copy_process(unsigned long clone_flags, return ERR_PTR(-EINVAL); /* - * If the new process will be in a different pid namespace - * don't allow the creation of threads. + * If the new process will be in a different pid namespace don't + * allow it to share a thread group or signal handlers with the + * forking task. */ - if ((clone_flags & (CLONE_VM|CLONE_NEWPID)) && + if ((clone_flags & (CLONE_SIGHAND | CLONE_NEWPID)) && (task_active_pid_ns(current) != current->nsproxy->pid_ns_for_children)) return ERR_PTR(-EINVAL); -- cgit v1.3-8-gc7d7 From 5167246a8ad617df55717c2d901da5e2aedffcfa Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:19:40 -0700 Subject: pidns: kill the unnecessary CLONE_NEWPID in copy_process() Commit 8382fcac1b81 ("pidns: Outlaw thread creation after unshare(CLONE_NEWPID)") nacks CLONE_NEWPID if the forking process unshared pid_ns. This is correct but unnecessary, copy_pid_ns() does the same check. Remove the CLONE_NEWPID check to cleanup the code and prepare for the next change. Test-case: static int child(void *arg) { return 0; } static char stack[16 * 1024]; int main(void) { pid_t pid; assert(unshare(CLONE_NEWUSER | CLONE_NEWPID) == 0); pid = clone(child, stack + sizeof(stack) / 2, CLONE_NEWPID | SIGCHLD, NULL); assert(pid < 0 && errno == EINVAL); return 0; } clone(CLONE_NEWPID) correctly fails with or without this change. Signed-off-by: Oleg Nesterov Acked-by: Andy Lutomirski Cc: "Eric W. Biederman" Cc: Colin Walters Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 3561391ca450..68d508f2bfba 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1177,9 +1177,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, * allow it to share a thread group or signal handlers with the * forking task. */ - if ((clone_flags & (CLONE_SIGHAND | CLONE_NEWPID)) && - (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) + if ((clone_flags & CLONE_SIGHAND) && (task_active_pid_ns(current) != + current->nsproxy->pid_ns_for_children)) return ERR_PTR(-EINVAL); retval = security_task_create(clone_flags); -- cgit v1.3-8-gc7d7 From 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:19:41 -0700 Subject: fork: unify and tighten up CLONE_NEWUSER/CLONE_NEWPID checks do_fork() denies CLONE_THREAD | CLONE_PARENT if NEWUSER | NEWPID. Then later copy_process() denies CLONE_SIGHAND if the new process will be in a different pid namespace (task_active_pid_ns() doesn't match current->nsproxy->pid_ns). This looks confusing and inconsistent. CLONE_NEWPID is very similar to the case when ->pid_ns was already unshared, we want the same restrictions so copy_process() should also nack CLONE_PARENT. And it would be better to deny CLONE_NEWUSER && CLONE_SIGHAND as well just for consistency. Kill the "CLONE_NEWUSER | CLONE_NEWPID" check in do_fork() and change copy_process() to do the same check along with ->pid_ns check we already have. Signed-off-by: Oleg Nesterov Acked-by: Andy Lutomirski Cc: "Eric W. Biederman" Cc: Colin Walters Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 68d508f2bfba..84703db06cf3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1173,13 +1173,16 @@ static struct task_struct *copy_process(unsigned long clone_flags, return ERR_PTR(-EINVAL); /* - * If the new process will be in a different pid namespace don't - * allow it to share a thread group or signal handlers with the - * forking task. + * If the new process will be in a different pid or user namespace + * do not allow it to share a thread group or signal handlers or + * parent with the forking task. */ - if ((clone_flags & CLONE_SIGHAND) && (task_active_pid_ns(current) != - current->nsproxy->pid_ns_for_children)) - return ERR_PTR(-EINVAL); + if (clone_flags & (CLONE_SIGHAND | CLONE_PARENT)) { + if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) || + (task_active_pid_ns(current) != + current->nsproxy->pid_ns_for_children)) + return ERR_PTR(-EINVAL); + } retval = security_task_create(clone_flags); if (retval) @@ -1575,15 +1578,6 @@ long do_fork(unsigned long clone_flags, int trace = 0; long nr; - /* - * Do some preliminary argument and permissions checking before we - * actually start allocating stuff - */ - if (clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) { - if (clone_flags & (CLONE_THREAD|CLONE_PARENT)) - return -EINVAL; - } - /* * Determine whether and which event to report to ptracer. When * called from kernel_thread or CLONE_UNTRACED is explicitly -- cgit v1.3-8-gc7d7 From ef0855d334e1e4af7c3e0c42146a8479ea14a5ab Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:20:14 -0700 Subject: mm: mempolicy: turn vma_set_policy() into vma_dup_policy() Simple cleanup. Every user of vma_set_policy() does the same work, this looks a bit annoying imho. And the new trivial helper which does mpol_dup() + vma_set_policy() to simplify the callers. Signed-off-by: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Cc: Andi Kleen Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 9 +++++++-- kernel/fork.c | 9 +++------ mm/mempolicy.c | 10 ++++++++++ mm/mmap.c | 17 +++++------------ 4 files changed, 25 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 0d7df39a5885..b2f897789838 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -91,7 +91,6 @@ static inline struct mempolicy *mpol_dup(struct mempolicy *pol) } #define vma_policy(vma) ((vma)->vm_policy) -#define vma_set_policy(vma, pol) ((vma)->vm_policy = (pol)) static inline void mpol_get(struct mempolicy *pol) { @@ -126,6 +125,7 @@ struct shared_policy { spinlock_t lock; }; +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, @@ -240,7 +240,12 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) } #define vma_policy(vma) NULL -#define vma_set_policy(vma, pol) do {} while(0) + +static inline int +vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + return 0; +} static inline void numa_policy_init(void) { diff --git a/kernel/fork.c b/kernel/fork.c index 84703db06cf3..81ccb4f010c2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -351,7 +351,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge; - struct mempolicy *pol; uprobe_start_dup_mmap(); down_write(&oldmm->mmap_sem); @@ -400,11 +399,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) goto fail_nomem; *tmp = *mpnt; INIT_LIST_HEAD(&tmp->anon_vma_chain); - pol = mpol_dup(vma_policy(mpnt)); - retval = PTR_ERR(pol); - if (IS_ERR(pol)) + retval = vma_dup_policy(mpnt, tmp); + if (retval) goto fail_nomem_policy; - vma_set_policy(tmp, pol); tmp->vm_mm = mm; if (anon_vma_fork(tmp, mpnt)) goto fail_nomem_anon_vma_fork; @@ -472,7 +469,7 @@ out: uprobe_end_dup_mmap(); return retval; fail_nomem_anon_vma_fork: - mpol_put(pol); + mpol_put(vma_policy(tmp)); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4baf12e534d1..6b1d426731ae 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2065,6 +2065,16 @@ retry_cpuset: } EXPORT_SYMBOL(alloc_pages_current); +int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) +{ + struct mempolicy *pol = mpol_dup(vma_policy(src)); + + if (IS_ERR(pol)) + return PTR_ERR(pol); + dst->vm_policy = pol; + return 0; +} + /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() diff --git a/mm/mmap.c b/mm/mmap.c index f9c97d10b873..14f6bb4830f7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2380,7 +2380,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { - struct mempolicy *pol; struct vm_area_struct *new; int err = -ENOMEM; @@ -2404,12 +2403,9 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT); } - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - err = PTR_ERR(pol); + err = vma_dup_policy(vma, new); + if (err) goto out_free_vma; - } - vma_set_policy(new, pol); if (anon_vma_clone(new, vma)) goto out_free_mpol; @@ -2437,7 +2433,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, fput(new->vm_file); unlink_anon_vmas(new); out_free_mpol: - mpol_put(pol); + mpol_put(vma_policy(new)); out_free_vma: kmem_cache_free(vm_area_cachep, new); out_err: @@ -2780,7 +2776,6 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; - struct mempolicy *pol; bool faulted_in_anon_vma = true; /* @@ -2825,10 +2820,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, new_vma->vm_start = addr; new_vma->vm_end = addr + len; new_vma->vm_pgoff = pgoff; - pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) + if (vma_dup_policy(vma, new_vma)) goto out_free_vma; - vma_set_policy(new_vma, pol); INIT_LIST_HEAD(&new_vma->anon_vma_chain); if (anon_vma_clone(new_vma, vma)) goto out_free_mempol; @@ -2843,7 +2836,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_free_mempol: - mpol_put(pol); + mpol_put(vma_policy(new_vma)); out_free_vma: kmem_cache_free(vm_area_cachep, new_vma); return NULL; -- cgit v1.3-8-gc7d7 From c33bc315fd921b1179a1d3df5756e0da6fb73944 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:21:44 -0700 Subject: mm: use zone_end_pfn() instead of zone_start_pfn+spanned_pages Use "zone_end_pfn()" instead of "zone->zone_start_pfn + zone->spanned_pages". Simplify the code, no functional change. [akpm@linux-foundation.org: fix build] Signed-off-by: Xishi Qiu Cc: Cody P Schafer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 12 ++++++------ mm/memory_hotplug.c | 7 ++++--- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 349587bb03e1..358a146fd4da 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -352,7 +352,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) struct mem_extent *ext, *cur, *aux; zone_start = zone->zone_start_pfn; - zone_end = zone->zone_start_pfn + zone->spanned_pages; + zone_end = zone_end_pfn(zone); list_for_each_entry(ext, list, hook) if (zone_start <= ext->end) @@ -884,7 +884,7 @@ static unsigned int count_highmem_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_highmem_page(zone, pfn)) n++; @@ -948,7 +948,7 @@ static unsigned int count_data_pages(void) continue; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (saveable_page(zone, pfn)) n++; @@ -1041,7 +1041,7 @@ copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm) unsigned long max_zone_pfn; mark_free_pages(zone); - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (page_is_saveable(zone, pfn)) memory_bm_set_bit(orig_bm, pfn); @@ -1093,7 +1093,7 @@ void swsusp_free(void) unsigned long pfn, max_zone_pfn; for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); @@ -1755,7 +1755,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) /* Clear page flags */ for_each_populated_zone(zone) { - max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; + max_zone_pfn = zone_end_pfn(zone); for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) if (pfn_valid(pfn)) swsusp_unset_page_free(pfn_to_page(pfn)); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8e333f953f08..9eadad626d64 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -229,7 +229,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); - old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + old_zone_end_pfn = zone_end_pfn(zone); if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; @@ -514,8 +514,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone, static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long zone_start_pfn = zone->zone_start_pfn; - unsigned long zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + unsigned long zone_start_pfn = zone->zone_start_pfn; + unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */ + unsigned long zone_end_pfn = z; unsigned long pfn; struct mem_section *ms; int nid = zone_to_nid(zone); -- cgit v1.3-8-gc7d7 From 86cdb465cf3a9d81058b517af05074157fa9dcdd Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Sep 2013 14:22:13 -0700 Subject: mm: prepare to remove /proc/sys/vm/hugepages_treat_as_movable Now hugepage migration is enabled, although restricted on pmd-based hugepages for now (due to lack of testing.) So we should allocate migratable hugepages from ZONE_MOVABLE if possible. This patch makes GFP flags in hugepage allocation dependent on migration support, not only the value of hugepages_treat_as_movable. It provides no change on the behavior for architectures which do not support hugepage migration, Signed-off-by: Naoya Horiguchi Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Cc: Hillf Danton Cc: Mel Gorman Cc: Hugh Dickins Cc: KOSAKI Motohiro Cc: Michal Hocko Cc: Rik van Riel Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 30 +++++++++++++++++++----------- kernel/sysctl.c | 2 +- mm/hugetlb.c | 32 ++++++++++++++------------------ 3 files changed, 34 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 36ecc26c7433..79a797eb3e87 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -200,17 +200,25 @@ fragmentation index is <= extfrag_threshold. The default value is 500. hugepages_treat_as_movable -This parameter is only useful when kernelcore= is specified at boot time to -create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages -are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero -value written to hugepages_treat_as_movable allows huge pages to be allocated -from ZONE_MOVABLE. - -Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge -pages pool can easily grow or shrink within. Assuming that applications are -not running that mlock() a lot of memory, it is likely the huge pages pool -can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value -into nr_hugepages and triggering page reclaim. +This parameter controls whether we can allocate hugepages from ZONE_MOVABLE +or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. +ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, +so this parameter has no effect if used without kernelcore=. + +Hugepage migration is now available in some situations which depend on the +architecture and/or the hugepage size. If a hugepage supports migration, +allocation from ZONE_MOVABLE is always enabled for the hugepage regardless +of the value of this parameter. +IOW, this parameter affects only non-migratable hugepages. + +Assuming that hugepages are not migratable in your system, one usecase of +this parameter is that users can make hugepage pool more extensible by +enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE +page reclaim/migration/compaction work more and you can get contiguous +memory more likely. Note that using ZONE_MOVABLE for non-migratable +hugepages can do harm to other features like memory hotremove (because +memory hotremove expects that memory blocks on ZONE_MOVABLE are always +removable,) so it's a trade-off responsible for the users. ============================================================== diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 07f6fc468e17..dc69093a8ec4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1225,7 +1225,7 @@ static struct ctl_table vm_table[] = { .data = &hugepages_treat_as_movable, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = hugetlb_treat_movable_handler, + .proc_handler = proc_dointvec, }, { .procname = "nr_overcommit_hugepages", diff --git a/mm/hugetlb.c b/mm/hugetlb.c index fb4293b93fd0..b49579c7f2a5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,7 +34,6 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; -static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; int hugetlb_max_hstate __read_mostly; @@ -539,6 +538,15 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return page; } +/* Movability of hugepages depends on migration support. */ +static inline gfp_t htlb_alloc_mask(struct hstate *h) +{ + if (hugepages_treat_as_movable || hugepage_migration_support(h)) + return GFP_HIGHUSER_MOVABLE; + else + return GFP_HIGHUSER; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -568,11 +576,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, retry_cpuset: cpuset_mems_cookie = get_mems_allowed(); zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + htlb_alloc_mask(h), &mpol, &nodemask); for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) { page = dequeue_huge_page_node(h, zone_to_nid(zone)); if (page) { if (avoid_reserve) @@ -738,7 +746,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return NULL; page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { @@ -965,12 +973,12 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (nid == NUMA_NO_NODE) - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + page = alloc_pages(htlb_alloc_mask(h)|__GFP_COMP| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); else page = alloc_pages_exact_node(nid, - htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { @@ -2117,18 +2125,6 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, } #endif /* CONFIG_NUMA */ -int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - proc_dointvec(table, write, buffer, length, ppos); - if (hugepages_treat_as_movable) - htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; - else - htlb_alloc_mask = GFP_HIGHUSER; - return 0; -} - int hugetlb_overcommit_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) -- cgit v1.3-8-gc7d7 From 3ddc5b46a8e90f3c9251338b60191d0a804b0d92 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Wed, 11 Sep 2013 14:23:18 -0700 Subject: kernel-wide: fix missing validations on __get/__put/__copy_to/__copy_from_user() I found the following pattern that leads in to interesting findings: grep -r "ret.*|=.*__put_user" * grep -r "ret.*|=.*__get_user" * grep -r "ret.*|=.*__copy" * The __put_user() calls in compat_ioctl.c, ptrace compat, signal compat, since those appear in compat code, we could probably expect the kernel addresses not to be reachable in the lower 32-bit range, so I think they might not be exploitable. For the "__get_user" cases, I don't think those are exploitable: the worse that can happen is that the kernel will copy kernel memory into in-kernel buffers, and will fail immediately afterward. The alpha csum_partial_copy_from_user() seems to be missing the access_ok() check entirely. The fix is inspired from x86. This could lead to information leak on alpha. I also noticed that many architectures map csum_partial_copy_from_user() to csum_partial_copy_generic(), but I wonder if the latter is performing the access checks on every architectures. Signed-off-by: Mathieu Desnoyers Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Jens Axboe Cc: Oleg Nesterov Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/alpha/lib/csum_partial_copy.c | 5 ++++ arch/sparc/kernel/sys_sparc32.c | 12 ++++----- block/compat_ioctl.c | 2 +- kernel/signal.c | 4 +-- net/socket.c | 50 +++++++++++++++++++------------------- 5 files changed, 39 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/lib/csum_partial_copy.c b/arch/alpha/lib/csum_partial_copy.c index 40736da9bea8..ffb19b7da999 100644 --- a/arch/alpha/lib/csum_partial_copy.c +++ b/arch/alpha/lib/csum_partial_copy.c @@ -338,6 +338,11 @@ csum_partial_copy_from_user(const void __user *src, void *dst, int len, unsigned long doff = 7 & (unsigned long) dst; if (len) { + if (!access_ok(VERIFY_READ, src, len)) { + *errp = -EFAULT; + memset(dst, 0, len); + return sum; + } if (!doff) { if (!soff) checksum = csum_partial_cfu_aligned( diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index 3d0ddbc005fe..71368850dfc0 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -169,10 +169,10 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, new_ka.ka_restorer = restorer; ret = get_user(u_handler, &act->sa_handler); new_ka.sa.sa_handler = compat_ptr(u_handler); - ret |= __copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); + ret |= copy_from_user(&set32, &act->sa_mask, sizeof(compat_sigset_t)); sigset_from_compat(&new_ka.sa.sa_mask, &set32); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); - ret |= __get_user(u_restorer, &act->sa_restorer); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(u_restorer, &act->sa_restorer); new_ka.sa.sa_restorer = compat_ptr(u_restorer); if (ret) return -EFAULT; @@ -183,9 +183,9 @@ COMPAT_SYSCALL_DEFINE5(rt_sigaction, int, sig, if (!ret && oact) { sigset_to_compat(&set32, &old_ka.sa.sa_mask); ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); - ret |= __copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); - ret |= __put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); + ret |= copy_to_user(&oact->sa_mask, &set32, sizeof(compat_sigset_t)); + ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); if (ret) ret = -EFAULT; } diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 7e5d474dc6ba..fbd5a67cb773 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -70,7 +70,7 @@ static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev, return ret; ret = copy_to_user(ugeo, &geo, 4); - ret |= __put_user(geo.start, &ugeo->start); + ret |= put_user(geo.start, &ugeo->start); if (ret) ret = -EFAULT; diff --git a/kernel/signal.c b/kernel/signal.c index 50e41075ac77..ded28b91fa53 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3394,7 +3394,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, new_ka.sa.sa_restorer = compat_ptr(restorer); #endif ret |= copy_from_user(&mask, &act->sa_mask, sizeof(mask)); - ret |= __get_user(new_ka.sa.sa_flags, &act->sa_flags); + ret |= get_user(new_ka.sa.sa_flags, &act->sa_flags); if (ret) return -EFAULT; sigset_from_compat(&new_ka.sa.sa_mask, &mask); @@ -3406,7 +3406,7 @@ COMPAT_SYSCALL_DEFINE4(rt_sigaction, int, sig, ret = put_user(ptr_to_compat(old_ka.sa.sa_handler), &oact->sa_handler); ret |= copy_to_user(&oact->sa_mask, &mask, sizeof(mask)); - ret |= __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + ret |= put_user(old_ka.sa.sa_flags, &oact->sa_flags); #ifdef __ARCH_HAS_SA_RESTORER ret |= put_user(ptr_to_compat(old_ka.sa.sa_restorer), &oact->sa_restorer); diff --git a/net/socket.c b/net/socket.c index b2d7c629eeb9..0ceaa5cb9ead 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3072,12 +3072,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, uifmap32 = &uifr32->ifr_ifru.ifru_map; err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name)); - err |= __get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= __get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= __get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= __get_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= __get_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= __get_user(ifr.ifr_map.port, &uifmap32->port); + err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= get_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= get_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= get_user(ifr.ifr_map.port, &uifmap32->port); if (err) return -EFAULT; @@ -3088,12 +3088,12 @@ static int compat_sioc_ifmap(struct net *net, unsigned int cmd, if (cmd == SIOCGIFMAP && !err) { err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name)); - err |= __put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); - err |= __put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); - err |= __put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); - err |= __put_user(ifr.ifr_map.irq, &uifmap32->irq); - err |= __put_user(ifr.ifr_map.dma, &uifmap32->dma); - err |= __put_user(ifr.ifr_map.port, &uifmap32->port); + err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start); + err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end); + err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr); + err |= put_user(ifr.ifr_map.irq, &uifmap32->irq); + err |= put_user(ifr.ifr_map.dma, &uifmap32->dma); + err |= put_user(ifr.ifr_map.port, &uifmap32->port); if (err) err = -EFAULT; } @@ -3167,25 +3167,25 @@ static int routing_ioctl(struct net *net, struct socket *sock, struct in6_rtmsg32 __user *ur6 = argp; ret = copy_from_user(&r6.rtmsg_dst, &(ur6->rtmsg_dst), 3 * sizeof(struct in6_addr)); - ret |= __get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); - ret |= __get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); - ret |= __get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); - ret |= __get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); - ret |= __get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); - ret |= __get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); - ret |= __get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); + ret |= get_user(r6.rtmsg_type, &(ur6->rtmsg_type)); + ret |= get_user(r6.rtmsg_dst_len, &(ur6->rtmsg_dst_len)); + ret |= get_user(r6.rtmsg_src_len, &(ur6->rtmsg_src_len)); + ret |= get_user(r6.rtmsg_metric, &(ur6->rtmsg_metric)); + ret |= get_user(r6.rtmsg_info, &(ur6->rtmsg_info)); + ret |= get_user(r6.rtmsg_flags, &(ur6->rtmsg_flags)); + ret |= get_user(r6.rtmsg_ifindex, &(ur6->rtmsg_ifindex)); r = (void *) &r6; } else { /* ipv4 */ struct rtentry32 __user *ur4 = argp; ret = copy_from_user(&r4.rt_dst, &(ur4->rt_dst), 3 * sizeof(struct sockaddr)); - ret |= __get_user(r4.rt_flags, &(ur4->rt_flags)); - ret |= __get_user(r4.rt_metric, &(ur4->rt_metric)); - ret |= __get_user(r4.rt_mtu, &(ur4->rt_mtu)); - ret |= __get_user(r4.rt_window, &(ur4->rt_window)); - ret |= __get_user(r4.rt_irtt, &(ur4->rt_irtt)); - ret |= __get_user(rtdev, &(ur4->rt_dev)); + ret |= get_user(r4.rt_flags, &(ur4->rt_flags)); + ret |= get_user(r4.rt_metric, &(ur4->rt_metric)); + ret |= get_user(r4.rt_mtu, &(ur4->rt_mtu)); + ret |= get_user(r4.rt_window, &(ur4->rt_window)); + ret |= get_user(r4.rt_irtt, &(ur4->rt_irtt)); + ret |= get_user(rtdev, &(ur4->rt_dev)); if (rtdev) { ret |= copy_from_user(devname, compat_ptr(rtdev), 15); r4.rt_dev = (char __user __force *)devname; -- cgit v1.3-8-gc7d7 From 54a33b1b1470ada14fa2998e8b48ad4a0ef6a916 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 11 Sep 2013 14:23:19 -0700 Subject: kernel/modsign_pubkey.c: fix init const for module signing code const has to use __initconst, not __initdata Signed-off-by: Andi Kleen Acked-by: David Howells Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/modsign_pubkey.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/modsign_pubkey.c b/kernel/modsign_pubkey.c index 2b6e69909c39..7cbd4507a7e6 100644 --- a/kernel/modsign_pubkey.c +++ b/kernel/modsign_pubkey.c @@ -18,14 +18,14 @@ struct key *modsign_keyring; -extern __initdata const u8 modsign_certificate_list[]; -extern __initdata const u8 modsign_certificate_list_end[]; +extern __initconst const u8 modsign_certificate_list[]; +extern __initconst const u8 modsign_certificate_list_end[]; /* * We need to make sure ccache doesn't cache the .o file as it doesn't notice * if modsign.pub changes. */ -static __initdata const char annoy_ccache[] = __TIME__ "foo"; +static __initconst const char annoy_ccache[] = __TIME__ "foo"; /* * Load the compiled-in keys -- cgit v1.3-8-gc7d7 From 60c323699bb308404dcb60e8808531e02651578a Mon Sep 17 00:00:00 2001 From: Chen Gang Date: Wed, 11 Sep 2013 14:23:22 -0700 Subject: kernel/smp.c: free related resources when failure occurs in hotplug_cfd() When failure occurs in hotplug_cfd(), need release related resources, or will cause memory leak. Signed-off-by: Chen Gang Acked-by: Wang YanQing Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 449b707fc20d..3bb6ae533cdf 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -48,10 +48,13 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) cpu_to_node(cpu))) return notifier_from_errno(-ENOMEM); if (!zalloc_cpumask_var_node(&cfd->cpumask_ipi, GFP_KERNEL, - cpu_to_node(cpu))) + cpu_to_node(cpu))) { + free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); + } cfd->csd = alloc_percpu(struct call_single_data); if (!cfd->csd) { + free_cpumask_var(cfd->cpumask_ipi); free_cpumask_var(cfd->cpumask); return notifier_from_errno(-ENOMEM); } -- cgit v1.3-8-gc7d7 From c14c338cb05c700a260480c197cfd6da8f8b7d2e Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 11 Sep 2013 14:23:23 -0700 Subject: kernel/spinlock.c: add default arch_*_relax definitions for GENERIC_LOCKBREAK When running with GENERIC_LOCKBREAK=y, the locking implementations emit calls to arch_{read,write,spin}_relax when spinning on a contended lock in order to allow architectures to favour the CPU owning the lock if possible. In reality, everybody apart from PowerPC and S390 just does cpu_relax() here, so make that the default behaviour and allow it to be overridden if required. Signed-off-by: Will Deacon Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/spinlock.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/spinlock.c b/kernel/spinlock.c index 5cdd8065a3ce..4b082b5cac9e 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c @@ -34,6 +34,20 @@ #else #define raw_read_can_lock(l) read_can_lock(l) #define raw_write_can_lock(l) write_can_lock(l) + +/* + * Some architectures can relax in favour of the CPU owning the lock. + */ +#ifndef arch_read_relax +# define arch_read_relax(l) cpu_relax() +#endif +#ifndef arch_write_relax +# define arch_write_relax(l) cpu_relax() +#endif +#ifndef arch_spin_relax +# define arch_spin_relax(l) cpu_relax() +#endif + /* * We build the __lock_function inlines here. They are too large for * inlining all over the place, but here is only one user per function -- cgit v1.3-8-gc7d7 From fa688207c9db48b64ab6538abc3fcdf26110b9ec Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:24 -0700 Subject: smp: quit unconditionally enabling irq in on_each_cpu_mask and on_each_cpu_cond As in commit f21afc25f9ed ("smp.h: Use local_irq_{save,restore}() in !SMP version of on_each_cpu()"), we don't want to enable irqs if they are not already enabled. There are currently no known problematical callers of these functions, but since it is a known failure pattern, we preemptively fix them. Since they are not trivial functions, make them non-inline by moving them to up.c. This also makes it so we don't have to fix #include dependancies for preempt_{disable,enable}. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 62 ++++++++++++++--------------------------------------- kernel/up.c | 39 +++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index c8488763277f..3724a9070907 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -29,6 +29,22 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on processors specified by mask, which might include + * the local one. + */ +void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, + void *info, bool wait); + +/* + * Call a function on each processor for which the supplied function + * cond_func returns a positive value. This may include the local + * processor. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags); + #ifdef CONFIG_SMP #include @@ -100,22 +116,6 @@ static inline void call_function_init(void) { } */ int on_each_cpu(smp_call_func_t func, void *info, int wait); -/* - * Call a function on processors specified by mask, which might include - * the local one. - */ -void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, - void *info, bool wait); - -/* - * Call a function on each processor for which the supplied function - * cond_func returns a positive value. This may include the local - * processor. - */ -void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), - smp_call_func_t func, void *info, bool wait, - gfp_t gfp_flags); - /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -151,36 +151,6 @@ static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) return 0; } -/* - * Note we still need to test the mask even for UP - * because we actually can get an empty mask from - * code that on SMP might call us without the local - * CPU in the mask. - */ -#define on_each_cpu_mask(mask, func, info, wait) \ - do { \ - if (cpumask_test_cpu(0, (mask))) { \ - local_irq_disable(); \ - (func)(info); \ - local_irq_enable(); \ - } \ - } while (0) -/* - * Preemption is disabled here to make sure the cond_func is called under the - * same condtions in UP and SMP. - */ -#define on_each_cpu_cond(cond_func, func, info, wait, gfp_flags)\ - do { \ - void *__info = (info); \ - preempt_disable(); \ - if ((cond_func)(0, __info)) { \ - local_irq_disable(); \ - (func)(__info); \ - local_irq_enable(); \ - } \ - preempt_enable(); \ - } while (0) - static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/up.c b/kernel/up.c index c54c75e9faf7..144e57255234 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -19,3 +19,42 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, return 0; } EXPORT_SYMBOL(smp_call_function_single); + +/* + * Note we still need to test the mask even for UP + * because we actually can get an empty mask from + * code that on SMP might call us without the local + * CPU in the mask. + */ +void on_each_cpu_mask(const struct cpumask *mask, + smp_call_func_t func, void *info, bool wait) +{ + unsigned long flags; + + if (cpumask_test_cpu(0, mask)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } +} +EXPORT_SYMBOL(on_each_cpu_mask); + +/* + * Preemption is disabled here to make sure the cond_func is called under the + * same condtions in UP and SMP. + */ +void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), + smp_call_func_t func, void *info, bool wait, + gfp_t gfp_flags) +{ + unsigned long flags; + + preempt_disable(); + if (cond_func(0, info)) { + local_irq_save(flags); + func(info); + local_irq_restore(flags); + } + preempt_enable(); +} +EXPORT_SYMBOL(on_each_cpu_cond); -- cgit v1.3-8-gc7d7 From 081192b25c2d4620b5f5838620624d3daee94b66 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:25 -0700 Subject: up.c: use local_irq_{save,restore}() in smp_call_function_single. The SMP version of this function doesn't unconditionally enable irqs, so neither should this !SMP version. There are no know problems caused by this, but we make the change for consistency's sake. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/up.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/up.c b/kernel/up.c index 144e57255234..b1cf036255f3 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -10,11 +10,13 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, int wait) { + unsigned long flags; + WARN_ON(cpu != 0); - local_irq_disable(); - (func)(info); - local_irq_enable(); + local_irq_save(flags); + func(info); + local_irq_restore(flags); return 0; } -- cgit v1.3-8-gc7d7 From bff2dc42bcafdd75c0296987747f782965d691a0 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:26 -0700 Subject: smp.h: move !SMP version of on_each_cpu() out-of-line All of the other non-trivial !SMP versions of functions in smp.h are out-of-line in up.c. Move on_each_cpu() there as well. This allows us to get rid of the #include . The drawback is that this makes both the x86_64 and i386 defconfig !SMP kernels about 200 bytes larger each. Signed-off-by: David Daney Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 21 +++++---------------- kernel/up.c | 11 +++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index 3724a9070907..cfb7ca094b38 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -11,7 +11,6 @@ #include #include #include -#include extern void cpu_idle(void); @@ -29,6 +28,11 @@ extern unsigned int total_cpus; int smp_call_function_single(int cpuid, smp_call_func_t func, void *info, int wait); +/* + * Call a function on all processors + */ +int on_each_cpu(smp_call_func_t func, void *info, int wait); + /* * Call a function on processors specified by mask, which might include * the local one. @@ -111,11 +115,6 @@ void generic_smp_call_function_single_interrupt(void); static inline void call_function_init(void) { } #endif -/* - * Call a function on all processors - */ -int on_each_cpu(smp_call_func_t func, void *info, int wait); - /* * Mark the boot cpu "online" so that it can call console drivers in * printk() and can access its per-cpu storage. @@ -141,16 +140,6 @@ static inline int up_smp_call_function(smp_call_func_t func, void *info) #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -static inline int on_each_cpu(smp_call_func_t func, void *info, int wait) -{ - unsigned long flags; - - local_irq_save(flags); - func(info); - local_irq_restore(flags); - return 0; -} - static inline void smp_send_reschedule(int cpu) { } #define smp_prepare_boot_cpu() do {} while (0) #define smp_call_function_many(mask, func, info, wait) \ diff --git a/kernel/up.c b/kernel/up.c index b1cf036255f3..630d72bf7e41 100644 --- a/kernel/up.c +++ b/kernel/up.c @@ -22,6 +22,17 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, } EXPORT_SYMBOL(smp_call_function_single); +int on_each_cpu(smp_call_func_t func, void *info, int wait) +{ + unsigned long flags; + + local_irq_save(flags); + func(info); + local_irq_restore(flags); + return 0; +} +EXPORT_SYMBOL(on_each_cpu); + /* * Note we still need to test the mask even for UP * because we actually can get an empty mask from -- cgit v1.3-8-gc7d7 From e656a634118285142063527b2cd40c749036de82 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Wed, 11 Sep 2013 14:23:27 -0700 Subject: extable: skip sorting if the table is empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least on ARM no-MMU the extable is empty and so there is nothing to sort. So add a check for the table to be empty which effectively only changes that the misleading pr_notice is suppressed. Signed-off-by: Uwe Kleine-König Cc: Ingo Molnar Cc: David Daney Cc: "H. Peter Anvin" Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 67460b93b1a1..832cb28105bb 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -41,7 +41,7 @@ u32 __initdata main_extable_sort_needed = 1; /* Sort the kernel's built-in exception table */ void __init sort_main_extable(void) { - if (main_extable_sort_needed) { + if (main_extable_sort_needed && __stop___ex_table > __start___ex_table) { pr_notice("Sorting __ex_table...\n"); sort_extable(__start___ex_table, __stop___ex_table); } -- cgit v1.3-8-gc7d7 From 202da400570d991bacda4a06e878cb901e96a783 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 11 Sep 2013 14:23:29 -0700 Subject: kernel/smp.c: quit unconditionally enabling irqs in on_each_cpu_mask(). As in commit f21afc25f9ed ("smp.h: Use local_irq_{save,restore}() in !SMP version of on_each_cpu()"), we don't want to enable irqs if they are not already enabled. I don't know of any bugs currently caused by this unconditional local_irq_enable(), but I want to use this function in MIPS/OCTEON early boot (when we have early_boot_irqs_disabled). This also makes this function have similar semantics to on_each_cpu() which is good in itself. Signed-off-by: David Daney Cc: Gilad Ben-Yossef Cc: Christoph Lameter Cc: Chris Metcalf Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/smp.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 3bb6ae533cdf..0564571dcdf7 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -575,8 +575,10 @@ EXPORT_SYMBOL(on_each_cpu); * * If @wait is true, then returns once @func has returned. * - * You must not call this function with disabled interrupts or - * from a hardware interrupt handler or from a bottom half handler. + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. The + * exception is that it may be used during early boot while + * early_boot_irqs_disabled is set. */ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait) @@ -585,9 +587,10 @@ void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func, smp_call_function_many(mask, func, info, wait); if (cpumask_test_cpu(cpu, mask)) { - local_irq_disable(); + unsigned long flags; + local_irq_save(flags); func(info); - local_irq_enable(); + local_irq_restore(flags); } put_cpu(); } -- cgit v1.3-8-gc7d7 From 205e550a0fb469ae73f91a903f27f4f63e774037 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:23:30 -0700 Subject: task_work: minor cleanups Trivial. Remove the unnecessary "work = NULL" initialization and turn read_barrier_depends() into smp_read_barrier_depends() in task_work_cancel(). Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/task_work.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index 65bd3c92d6f3..6ee09856f725 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -25,7 +25,7 @@ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { struct callback_head **pprev = &task->task_works; - struct callback_head *work = NULL; + struct callback_head *work; unsigned long flags; /* * If cmpxchg() fails we continue without updating pprev. @@ -35,7 +35,7 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) */ raw_spin_lock_irqsave(&task->pi_lock, flags); while ((work = ACCESS_ONCE(*pprev))) { - read_barrier_depends(); + smp_read_barrier_depends(); if (work->func != func) pprev = &work->next; else if (cmpxchg(pprev, work, work->next) == work) -- cgit v1.3-8-gc7d7 From 892f6668f3a7088c7e30049c3d8e1844531602dc Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 14:23:31 -0700 Subject: task_work: documentation No functional changes, just comments. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/task_work.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'kernel') diff --git a/kernel/task_work.c b/kernel/task_work.c index 6ee09856f725..8727032e3a6f 100644 --- a/kernel/task_work.c +++ b/kernel/task_work.c @@ -4,6 +4,23 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */ +/** + * task_work_add - ask the @task to execute @work->func() + * @task: the task which should run the callback + * @work: the callback to run + * @notify: send the notification if true + * + * Queue @work for task_work_run() below and notify the @task if @notify. + * Fails if the @task is exiting/exited and thus it can't process this @work. + * Otherwise @work->func() will be called when the @task returns from kernel + * mode or exits. + * + * This is like the signal handler which runs in kernel mode, but it doesn't + * try to wake up the @task. + * + * RETURNS: + * 0 if succeeds or -ESRCH. + */ int task_work_add(struct task_struct *task, struct callback_head *work, bool notify) { @@ -21,6 +38,17 @@ task_work_add(struct task_struct *task, struct callback_head *work, bool notify) return 0; } +/** + * task_work_cancel - cancel a pending work added by task_work_add() + * @task: the task which should execute the work + * @func: identifies the work to remove + * + * Find the last queued pending work with ->func == @func and remove + * it from queue. + * + * RETURNS: + * The found work or NULL if not found. + */ struct callback_head * task_work_cancel(struct task_struct *task, task_work_func_t func) { @@ -46,6 +74,14 @@ task_work_cancel(struct task_struct *task, task_work_func_t func) return work; } +/** + * task_work_run - execute the works added by task_work_add() + * + * Flush the pending works. Should be used by the core kernel code. + * Called before the task returns to the user-mode or stops, or when + * it exits. In the latter case task_work_add() can no longer add the + * new work after task_work_run() returns. + */ void task_work_run(void) { struct task_struct *task = current; -- cgit v1.3-8-gc7d7 From c802d64a356b5cf349121ac4c5e005f037ce548d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:11 -0700 Subject: kprobes: unify insn caches The current kpropes insn caches allocate memory areas for insn slots with module_alloc(). The assumption is that the kernel image and module area are both within the same +/- 2GB memory area. This however is not true for s390 where the kernel image resides within the first 2GB (DMA memory area), but the module area is far away in the vmalloc area, usually somewhere close below the 4TB area. For new pc relative instructions s390 needs insn slots that are within +/- 2GB of each area. That way we can patch displacements of pc-relative instructions within the insn slots just like x86 and powerpc. The module area works already with the normal insn slot allocator, however there is currently no way to get insn slots that are within the first 2GB on s390 (aka DMA area). Therefore this patch set modifies the kprobes insn slot cache code in order to allow to specify a custom allocator for the insn slot cache pages. In addition architecure can now have private insn slot caches withhout the need to modify common code. Patch 1 unifies and simplifies the current insn and optinsn caches implementation. This is a preparation which allows to add more insn caches in a simple way. Patch 2 adds the possibility to specify a custom allocator. Patch 3 makes s390 use the new insn slot mechanisms and adds support for pc-relative instructions with long displacements. This patch (of 3): The two insn caches (insn, and optinsn) each have an own mutex and alloc/free functions (get_[opt]insn_slot() / free_[opt]insn_slot()). Since there is the need for yet another insn cache which satifies dma allocations on s390, unify and simplify the current implementation: - Move the per insn cache mutex into struct kprobe_insn_cache. - Move the alloc/free functions to kprobe.h so they are simply wrappers for the generic __get_insn_slot/__free_insn_slot functions. The implementation is done with a DEFINE_INSN_CACHE_OPS() macro which provides the alloc/free functions for each cache if needed. - move the struct kprobe_insn_cache to kprobe.h which allows to generate architecture specific insn slot caches outside of the core kprobes code. Signed-off-by: Heiko Carstens Cc: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 32 ++++++++++++++++++--- kernel/kprobes.c | 75 ++++++++++++++----------------------------------- 2 files changed, 49 insertions(+), 58 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index ca1d27a0d6a6..077f65321b5e 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -264,10 +264,34 @@ extern void arch_arm_kprobe(struct kprobe *p); extern void arch_disarm_kprobe(struct kprobe *p); extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); -extern kprobe_opcode_t *get_insn_slot(void); -extern void free_insn_slot(kprobe_opcode_t *slot, int dirty); extern void kprobes_inc_nmissed_count(struct kprobe *p); +struct kprobe_insn_cache { + struct mutex mutex; + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; +}; + +extern kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c); +extern void __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty); + +#define DEFINE_INSN_CACHE_OPS(__name) \ +extern struct kprobe_insn_cache kprobe_##__name##_slots; \ + \ +static inline kprobe_opcode_t *get_##__name##_slot(void) \ +{ \ + return __get_insn_slot(&kprobe_##__name##_slots); \ +} \ + \ +static inline void free_##__name##_slot(kprobe_opcode_t *slot, int dirty)\ +{ \ + __free_insn_slot(&kprobe_##__name##_slots, slot, dirty); \ +} \ + +DEFINE_INSN_CACHE_OPS(insn); + #ifdef CONFIG_OPTPROBES /* * Internal structure for direct jump optimized probe @@ -287,13 +311,13 @@ extern void arch_optimize_kprobes(struct list_head *oplist); extern void arch_unoptimize_kprobes(struct list_head *oplist, struct list_head *done_list); extern void arch_unoptimize_kprobe(struct optimized_kprobe *op); -extern kprobe_opcode_t *get_optinsn_slot(void); -extern void free_optinsn_slot(kprobe_opcode_t *slot, int dirty); extern int arch_within_optimized_kprobe(struct optimized_kprobe *op, unsigned long addr); extern void opt_pre_handler(struct kprobe *p, struct pt_regs *regs); +DEFINE_INSN_CACHE_OPS(optinsn); + #ifdef CONFIG_SYSCTL extern int sysctl_kprobes_optimization; extern int proc_kprobes_optimization_handler(struct ctl_table *table, diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6e33498d665c..9e4912dc5559 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -121,12 +121,6 @@ struct kprobe_insn_page { (offsetof(struct kprobe_insn_page, slot_used) + \ (sizeof(char) * (slots))) -struct kprobe_insn_cache { - struct list_head pages; /* list of kprobe_insn_page */ - size_t insn_size; /* size of instruction slot */ - int nr_garbage; -}; - static int slots_per_page(struct kprobe_insn_cache *c) { return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); @@ -138,8 +132,8 @@ enum kprobe_slot_state { SLOT_USED = 2, }; -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ -static struct kprobe_insn_cache kprobe_insn_slots = { +struct kprobe_insn_cache kprobe_insn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -150,10 +144,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) +kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; + kprobe_opcode_t *slot = NULL; + mutex_lock(&c->mutex); retry: list_for_each_entry(kip, &c->pages, list) { if (kip->nused < slots_per_page(c)) { @@ -162,7 +158,8 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * c->insn_size); + slot = kip->insns + (i * c->insn_size); + goto out; } } /* kip->nused is broken. Fix it. */ @@ -178,7 +175,7 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) /* All out of space. Need to allocate a new page. */ kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) - return NULL; + goto out; /* * Use module_alloc so this page is within +/- 2GB of where the @@ -188,7 +185,7 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->insns = module_alloc(PAGE_SIZE); if (!kip->insns) { kfree(kip); - return NULL; + goto out; } INIT_LIST_HEAD(&kip->list); memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); @@ -196,19 +193,10 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->nused = 1; kip->ngarbage = 0; list_add(&kip->list, &c->pages); - return kip->insns; -} - - -kprobe_opcode_t __kprobes *get_insn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(&kprobe_insn_slots); - mutex_unlock(&kprobe_insn_mutex); - - return ret; + slot = kip->insns; +out: + mutex_unlock(&c->mutex); + return slot; } /* Return 1 if all garbages are collected, otherwise 0. */ @@ -255,11 +243,12 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) return 0; } -static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, - kprobe_opcode_t *slot, int dirty) +void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; + mutex_lock(&c->mutex); list_for_each_entry(kip, &c->pages, list) { long idx = ((long)slot - (long)kip->insns) / (c->insn_size * sizeof(kprobe_opcode_t)); @@ -272,45 +261,23 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, collect_garbage_slots(c); } else collect_one_slot(kip, idx); - return; + goto out; } } /* Could not free this slot. */ WARN_ON(1); +out: + mutex_unlock(&c->mutex); } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_insn_mutex); - __free_insn_slot(&kprobe_insn_slots, slot, dirty); - mutex_unlock(&kprobe_insn_mutex); -} #ifdef CONFIG_OPTPROBES /* For optimized_kprobe buffer */ -static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ -static struct kprobe_insn_cache kprobe_optinsn_slots = { +struct kprobe_insn_cache kprobe_optinsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, }; -/* Get a slot for optimized_kprobe buffer */ -kprobe_opcode_t __kprobes *get_optinsn_slot(void) -{ - kprobe_opcode_t *ret = NULL; - - mutex_lock(&kprobe_optinsn_mutex); - ret = __get_insn_slot(&kprobe_optinsn_slots); - mutex_unlock(&kprobe_optinsn_mutex); - - return ret; -} - -void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) -{ - mutex_lock(&kprobe_optinsn_mutex); - __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); - mutex_unlock(&kprobe_optinsn_mutex); -} #endif #endif -- cgit v1.3-8-gc7d7 From af96397de8600232effbff43dc8b4ca20ddc02b1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 11 Sep 2013 14:24:13 -0700 Subject: kprobes: allow to specify custom allocator for insn caches The current two insn slot caches both use module_alloc/module_free to allocate and free insn slot cache pages. For s390 this is not sufficient since there is the need to allocate insn slots that are either within the vmalloc module area or within dma memory. Therefore add a mechanism which allows to specify an own allocator for an own insn slot cache. Signed-off-by: Heiko Carstens Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Ingo Molnar Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kprobes.h | 2 ++ kernel/kprobes.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index 077f65321b5e..925eaf28fca9 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -268,6 +268,8 @@ extern void kprobes_inc_nmissed_count(struct kprobe *p); struct kprobe_insn_cache { struct mutex mutex; + void *(*alloc)(void); /* allocate insn page */ + void (*free)(void *); /* free insn page */ struct list_head pages; /* list of kprobe_insn_page */ size_t insn_size; /* size of instruction slot */ int nr_garbage; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 9e4912dc5559..a0d367a49122 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -112,6 +112,7 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ + struct kprobe_insn_cache *cache; int nused; int ngarbage; char slot_used[]; @@ -132,8 +133,20 @@ enum kprobe_slot_state { SLOT_USED = 2, }; +static void *alloc_insn_page(void) +{ + return module_alloc(PAGE_SIZE); +} + +static void free_insn_page(void *page) +{ + module_free(NULL, page); +} + struct kprobe_insn_cache kprobe_insn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_insn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), .insn_size = MAX_INSN_SIZE, .nr_garbage = 0, @@ -182,7 +195,7 @@ kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) * kernel image and loaded module images reside. This is required * so x86_64 can correctly handle the %rip-relative fixups. */ - kip->insns = module_alloc(PAGE_SIZE); + kip->insns = c->alloc(); if (!kip->insns) { kfree(kip); goto out; @@ -192,6 +205,7 @@ kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + kip->cache = c; list_add(&kip->list, &c->pages); slot = kip->insns; out: @@ -213,7 +227,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) */ if (!list_is_singular(&kip->list)) { list_del(&kip->list); - module_free(NULL, kip->insns); + kip->cache->free(kip->insns); kfree(kip); } return 1; @@ -274,6 +288,8 @@ out: /* For optimized_kprobe buffer */ struct kprobe_insn_cache kprobe_optinsn_slots = { .mutex = __MUTEX_INITIALIZER(kprobe_optinsn_slots.mutex), + .alloc = alloc_insn_page, + .free = free_insn_page, .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), /* .insn_size is initialized later */ .nr_garbage = 0, -- cgit v1.3-8-gc7d7 From 73af963f9f3036dffed55c3a2898598186db1045 Mon Sep 17 00:00:00 2001 From: Mark Grondona Date: Wed, 11 Sep 2013 14:24:31 -0700 Subject: __ptrace_may_access() should not deny sub-threads __ptrace_may_access() checks get_dumpable/ptrace_has_cap/etc if task != current, this can can lead to surprising results. For example, a sub-thread can't readlink("/proc/self/exe") if the executable is not readable. setup_new_exec()->would_dump() notices that inode_permission(MAY_READ) fails and then it does set_dumpable(suid_dumpable). After that get_dumpable() fails. (It is not clear why proc_pid_readlink() checks get_dumpable(), perhaps we could add PTRACE_MODE_NODUMPABLE) Change __ptrace_may_access() to use same_thread_group() instead of "task == current". Any security check is pointless when the tasks share the same ->mm. Signed-off-by: Mark Grondona Signed-off-by: Ben Woodard Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index a146ee327f6a..dd562e9aa2c8 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -236,7 +236,7 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) */ int dumpable = 0; /* Don't let security modules deny introspection */ - if (task == current) + if (same_thread_group(task, current)) return 0; rcu_read_lock(); tcred = __task_cred(task); -- cgit v1.3-8-gc7d7 From 80c74f6a40284c5c5d49f3b3289172bbce0b30b8 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Wed, 11 Sep 2013 14:24:47 -0700 Subject: kexec: remove unnecessary return Code can not run here forever, so remove the unnecessary return. Signed-off-by: Xishi Qiu Suggested-by: Zhang Yanfei Reviewed-by: Simon Horman Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 59f7b55ba745..2a74f307c5ec 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1474,11 +1474,8 @@ static int __init __parse_crashkernel(char *cmdline, if (first_colon && (!first_space || first_colon < first_space)) return parse_crashkernel_mem(ck_cmdline, system_ram, crash_size, crash_base); - else - return parse_crashkernel_simple(ck_cmdline, crash_size, - crash_base); - return 0; + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); } /* -- cgit v1.3-8-gc7d7 From 6723734cdff15211bb78aeea76ca847374bd93ae Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 11 Sep 2013 14:25:49 -0700 Subject: panic: call panic handlers before kmsg_dump Since the panic handlers may produce additional information (via printk) for the kernel log, it should be reported as part of the panic output saved by kmsg_dump(). Without this re-ordering, nothing that adds information to a panic will show up in pstore's view when kmsg_dump runs, and is therefore not visible to crash reporting tools that examine pstore output. Signed-off-by: Kees Cook Cc: Anton Vorontsov Cc: Colin Cross Acked-by: Tony Luck Cc: Stephen Boyd Cc: Vikram Mulukutla Cc: Peter Zijlstra Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 801864600514..b6c482ccc5db 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -123,10 +123,14 @@ void panic(const char *fmt, ...) */ smp_send_stop(); - kmsg_dump(KMSG_DUMP_PANIC); - + /* + * Run any panic handlers, including those that might need to + * add information to the kmsg dump output. + */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); if (!panic_blink) -- cgit v1.3-8-gc7d7 From 7bd36014460f793c19e7d6c94dab67b0afcfcb7f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Sep 2013 16:50:56 -0700 Subject: timekeeping: Fix HRTICK related deadlock from ntp lock changes Gerlando Falauto reported that when HRTICK is enabled, it is possible to trigger system deadlocks. These were hard to reproduce, as HRTICK has been broken in the past, but seemed to be connected to the timekeeping_seq lock. Since seqlock/seqcount's aren't supported w/ lockdep, I added some extra spinlock based locking and triggered the following lockdep output: [ 15.849182] ntpd/4062 is trying to acquire lock: [ 15.849765] (&(&pool->lock)->rlock){..-...}, at: [] __queue_work+0x145/0x480 [ 15.850051] [ 15.850051] but task is already holding lock: [ 15.850051] (timekeeper_lock){-.-.-.}, at: [] do_adjtimex+0x7f/0x100 [ 15.850051] Chain exists of: &(&pool->lock)->rlock --> &p->pi_lock --> timekeeper_lock [ 15.850051] Possible unsafe locking scenario: [ 15.850051] [ 15.850051] CPU0 CPU1 [ 15.850051] ---- ---- [ 15.850051] lock(timekeeper_lock); [ 15.850051] lock(&p->pi_lock); [ 15.850051] lock(timekeeper_lock); [ 15.850051] lock(&(&pool->lock)->rlock); [ 15.850051] [ 15.850051] *** DEADLOCK *** The deadlock was introduced by 06c017fdd4dc48451a ("timekeeping: Hold timekeepering locks in do_adjtimex and hardpps") in 3.10 This patch avoids this deadlock, by moving the call to schedule_delayed_work() outside of the timekeeper lock critical section. Reported-by: Gerlando Falauto Tested-by: Lin Ming Signed-off-by: John Stultz Cc: Mathieu Desnoyers Cc: stable #3.11, 3.10 Link: http://lkml.kernel.org/r/1378943457-27314-1-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/timex.h | 1 + kernel/time/ntp.c | 6 ++---- kernel/time/timekeeping.c | 2 ++ 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index b3726e61368e..dd3edd7dfc94 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -141,6 +141,7 @@ extern int do_adjtimex(struct timex *); extern void hardpps(const struct timespec *, const struct timespec *); int read_current_timer(unsigned long *timer_val); +void ntp_notify_cmos_timer(void); /* The clock frequency of the i8253/i8254 PIT */ #define PIT_TICK_RATE 1193182ul diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 8f5b3b98577b..bb2215174f05 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -516,13 +516,13 @@ static void sync_cmos_clock(struct work_struct *work) schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next)); } -static void notify_cmos_timer(void) +void ntp_notify_cmos_timer(void) { schedule_delayed_work(&sync_cmos_work, 0); } #else -static inline void notify_cmos_timer(void) { } +void ntp_notify_cmos_timer(void) { } #endif @@ -687,8 +687,6 @@ int __do_adjtimex(struct timex *txc, struct timespec *ts, s32 *time_tai) if (!(time_status & STA_NANO)) txc->time.tv_usec /= NSEC_PER_USEC; - notify_cmos_timer(); - return result; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 48b9fffabdc2..947ba25a95a0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1703,6 +1703,8 @@ int do_adjtimex(struct timex *txc) write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + ntp_notify_cmos_timer(); + return ret; } -- cgit v1.3-8-gc7d7 From 878b5a6efd38030c7a90895dc8346e8fb1e09b4c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 11 Sep 2013 17:47:26 +0200 Subject: uprobes: Fix utask->depth accounting in handle_trampoline() Currently utask->depth is simply the number of allocated/pending return_instance's in uprobe_task->return_instances list. handle_trampoline() should decrement this counter every time we handle/free an instance, but due to typo it does this only if ->chained == T. This means that in the likely case this counter is never decremented and the probed task can't report more than MAX_URETPROBE_DEPTH events. Reported-by: Mikhail Kulemin Reported-by: Hemant Kumar Shaw Signed-off-by: Oleg Nesterov Acked-by: Anton Arapov Cc: masami.hiramatsu.pt@hitachi.com Cc: srikar@linux.vnet.ibm.com Cc: systemtap@sourceware.org Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20130911154726.GA8093@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/uprobes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f3569747d629..ad8e1bdca70e 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1682,12 +1682,10 @@ static bool handle_trampoline(struct pt_regs *regs) tmp = ri; ri = ri->next; kfree(tmp); + utask->depth--; if (!chained) break; - - utask->depth--; - BUG_ON(!ri); } -- cgit v1.3-8-gc7d7 From 6c9a27f5da9609fca46cb2b183724531b48f71ad Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Tue, 10 Sep 2013 18:16:36 +0900 Subject: sched/fair: Fix small race where child->se.parent,cfs_rq might point to invalid ones There is a small race between copy_process() and cgroup_attach_task() where child->se.parent,cfs_rq points to invalid (old) ones. parent doing fork() | someone moving the parent to another cgroup -------------------------------+--------------------------------------------- copy_process() + dup_task_struct() -> parent->se is copied to child->se. se.parent,cfs_rq of them point to old ones. cgroup_attach_task() + cgroup_task_migrate() -> parent->cgroup is updated. + cpu_cgroup_attach() + sched_move_task() + task_move_group_fair() +- set_task_rq() -> se.parent,cfs_rq of parent are updated. + cgroup_fork() -> parent->cgroup is copied to child->cgroup. (*1) + sched_fork() + task_fork_fair() -> se.parent,cfs_rq of child are accessed while they point to old ones. (*2) In the worst case, this bug can lead to "use-after-free" and cause a panic, because it's new cgroup's refcount that is incremented at (*1), so the old cgroup(and related data) can be freed before (*2). In fact, a panic caused by this bug was originally caught in RHEL6.4. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] sched_slice+0x6e/0xa0 [...] Call Trace: [] place_entity+0x75/0xa0 [] task_fork_fair+0xaa/0x160 [] sched_fork+0x6b/0x140 [] copy_process+0x5b2/0x1450 [] ? wake_up_new_task+0xd9/0x130 [] do_fork+0x94/0x460 [] ? sys_wait4+0xae/0x100 [] sys_clone+0x28/0x30 [] stub_clone+0x13/0x20 [] ? system_call_fastpath+0x16/0x1b Signed-off-by: Daisuke Nishimura Signed-off-by: Peter Zijlstra Cc: Link: http://lkml.kernel.org/r/039601ceae06$733d3130$59b79390$@mxp.nes.nec.co.jp Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9b3fe1cd8f40..11cd13667359 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5928,11 +5928,15 @@ static void task_fork_fair(struct task_struct *p) cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - if (unlikely(task_cpu(p) != this_cpu)) { - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - } + /* + * Not only the cpu but also the task_group of the parent might have + * been changed after parent->se.parent,cfs_rq were copied to + * child->se.parent,cfs_rq. So call __set_task_cpu() to make those + * of child point to valid ones. + */ + rcu_read_lock(); + __set_task_cpu(p, this_cpu); + rcu_read_unlock(); update_curr(cfs_rq); -- cgit v1.3-8-gc7d7 From fc840914e9b07ab4685c195e1e54e58de4f84c03 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 9 Sep 2013 13:01:41 +0200 Subject: sched/debug: Take PID namespace into account Emmanuel reported that /proc/sched_debug didn't report the right PIDs when using namespaces, cure this. Reported-by: Emmanuel Deloget Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130909110141.GM31370@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index e076bddd4c66..196559994f7c 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -124,7 +124,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) SEQ_printf(m, " "); SEQ_printf(m, "%15s %5d %9Ld.%06ld %9Ld %5d ", - p->comm, p->pid, + p->comm, task_pid_nr(p), SPLIT_NS(p->se.vruntime), (long long)(p->nvcsw + p->nivcsw), p->prio); @@ -289,7 +289,7 @@ do { \ P(nr_load_updates); P(nr_uninterruptible); PN(next_balance); - P(curr->pid); + SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); P(cpu_load[0]); P(cpu_load[1]); @@ -492,7 +492,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, task_pid_nr(p), get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------" -- cgit v1.3-8-gc7d7 From 6de5a8bfcae6e3b427d642eff078d8305b324b52 Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 12 Sep 2013 15:13:47 -0700 Subject: memcg: rename RESOURCE_MAX to RES_COUNTER_MAX RESOURCE_MAX is far too general name, change it to RES_COUNTER_MAX. Signed-off-by: Sha Zhengju Signed-off-by: Qiang Huang Acked-by: Michal Hocko Cc: Daisuke Nishimura Cc: Jeff Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/res_counter.h | 2 +- kernel/res_counter.c | 8 ++++---- mm/memcontrol.c | 4 ++-- net/ipv4/tcp_memcontrol.c | 10 +++++----- 4 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index 586bc7cb1dcf..201a69749659 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -54,7 +54,7 @@ struct res_counter { struct res_counter *parent; }; -#define RESOURCE_MAX ULLONG_MAX +#define RES_COUNTER_MAX ULLONG_MAX /** * Helpers to interact with userspace diff --git a/kernel/res_counter.c b/kernel/res_counter.c index ff55247e7049..3f0417f97e76 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -17,8 +17,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent) { spin_lock_init(&counter->lock); - counter->limit = RESOURCE_MAX; - counter->soft_limit = RESOURCE_MAX; + counter->limit = RES_COUNTER_MAX; + counter->soft_limit = RES_COUNTER_MAX; counter->parent = parent; } @@ -182,12 +182,12 @@ int res_counter_memparse_write_strategy(const char *buf, { char *end; - /* return RESOURCE_MAX(unlimited) if "-1" is specified */ + /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { *res = simple_strtoull(buf + 1, &end, 10); if (*res != 1 || *end != '\0') return -EINVAL; - *res = RESOURCE_MAX; + *res = RES_COUNTER_MAX; return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4b5cfb509270..2c71f243186e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4967,7 +4967,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) */ mutex_lock(&memcg_create_mutex); mutex_lock(&set_limit_mutex); - if (!memcg->kmem_account_flags && val != RESOURCE_MAX) { + if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { ret = -EBUSY; goto out; @@ -4977,7 +4977,7 @@ static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) ret = memcg_update_cache_sizes(memcg); if (ret) { - res_counter_set_limit(&memcg->kmem, RESOURCE_MAX); + res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); goto out; } static_key_slow_inc(&memcg_kmem_enabled_key); diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 8a57d79b0b16..559d4ae6ebf4 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c @@ -87,8 +87,8 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) if (!cg_proto) return -EINVAL; - if (val > RESOURCE_MAX) - val = RESOURCE_MAX; + if (val > RES_COUNTER_MAX) + val = RES_COUNTER_MAX; tcp = tcp_from_cgproto(cg_proto); @@ -101,9 +101,9 @@ static int tcp_update_limit(struct mem_cgroup *memcg, u64 val) tcp->tcp_prot_mem[i] = min_t(long, val >> PAGE_SHIFT, net->ipv4.sysctl_tcp_mem[i]); - if (val == RESOURCE_MAX) + if (val == RES_COUNTER_MAX) clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags); - else if (val != RESOURCE_MAX) { + else if (val != RES_COUNTER_MAX) { /* * The active bit needs to be written after the static_key * update. This is what guarantees that the socket activation @@ -187,7 +187,7 @@ static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft) switch (cft->private) { case RES_LIMIT: - val = tcp_read_stat(memcg, RES_LIMIT, RESOURCE_MAX); + val = tcp_read_stat(memcg, RES_LIMIT, RES_COUNTER_MAX); break; case RES_USAGE: val = tcp_read_usage(memcg); -- cgit v1.3-8-gc7d7 From 3af3351676c3deecfd632f47719fb0d13a061ba8 Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 12 Sep 2013 15:13:48 -0700 Subject: memcg: avoid overflow caused by PAGE_ALIGN Since PAGE_ALIGN is aligning up(the next page boundary), so after PAGE_ALIGN, the value might be overflow, such as write the MAX value to *.limit_in_bytes. $ cat /cgroup/memory/memory.limit_in_bytes 18446744073709551615 # echo 18446744073709551615 > /cgroup/memory/memory.limit_in_bytes bash: echo: write error: Invalid argument Some user programs might depend on such behaviours(like libcg, we read the value in snapshot, then use the value to reset cgroup later), and that will cause confusion. So we need to fix it. Signed-off-by: Sha Zhengju Signed-off-by: Qiang Huang Acked-by: Michal Hocko Cc: Daisuke Nishimura Cc: Jeff Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 3f0417f97e76..085d3ae478fe 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -195,6 +195,10 @@ int res_counter_memparse_write_strategy(const char *buf, if (*end != '\0') return -EINVAL; - *res = PAGE_ALIGN(*res); + if (PAGE_ALIGN(*res) >= *res) + *res = PAGE_ALIGN(*res); + else + *res = RES_COUNTER_MAX; + return 0; } -- cgit v1.3-8-gc7d7 From 1a36e59d4833de19120dc7482c61ef69e228c73c Mon Sep 17 00:00:00 2001 From: Sha Zhengju Date: Thu, 12 Sep 2013 15:13:49 -0700 Subject: memcg: reduce function dereference This function dereferences res far too often, so optimize it. Signed-off-by: Sha Zhengju Signed-off-by: Qiang Huang Acked-by: Michal Hocko Cc: Daisuke Nishimura Cc: Jeff Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/res_counter.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 085d3ae478fe..4aa8a305aede 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -178,27 +178,30 @@ u64 res_counter_read_u64(struct res_counter *counter, int member) #endif int res_counter_memparse_write_strategy(const char *buf, - unsigned long long *res) + unsigned long long *resp) { char *end; + unsigned long long res; /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { - *res = simple_strtoull(buf + 1, &end, 10); - if (*res != 1 || *end != '\0') + res = simple_strtoull(buf + 1, &end, 10); + if (res != 1 || *end != '\0') return -EINVAL; - *res = RES_COUNTER_MAX; + *resp = RES_COUNTER_MAX; return 0; } - *res = memparse(buf, &end); + res = memparse(buf, &end); if (*end != '\0') return -EINVAL; - if (PAGE_ALIGN(*res) >= *res) - *res = PAGE_ALIGN(*res); + if (PAGE_ALIGN(res) >= res) + res = PAGE_ALIGN(res); else - *res = RES_COUNTER_MAX; + res = RES_COUNTER_MAX; + + *resp = res; return 0; } -- cgit v1.3-8-gc7d7 From 6072ddc8520b86adfac6939ca32fb6e6c4de017a Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 12 Sep 2013 15:14:07 -0700 Subject: kernel: replace strict_strto*() with kstrto*() The usage of strict_strto*() is not preferred, because strict_strto*() is obsolete. Thus, kstrto*() should be used. Signed-off-by: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/gcov/fs.c | 2 +- kernel/ksysfs.c | 2 +- kernel/params.c | 14 +++++++------- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c index 9bd0934f6c33..7a7d2ee96d42 100644 --- a/kernel/gcov/fs.c +++ b/kernel/gcov/fs.c @@ -74,7 +74,7 @@ static int __init gcov_persist_setup(char *str) { unsigned long val; - if (strict_strtoul(str, 0, &val)) { + if (kstrtoul(str, 0, &val)) { pr_warning("invalid gcov_persist parameter '%s'\n", str); return 0; } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 6ada93c23a9a..9659d38e008f 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -113,7 +113,7 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, unsigned long cnt; int ret; - if (strict_strtoul(buf, 0, &cnt)) + if (kstrtoul(buf, 0, &cnt)) return -EINVAL; ret = crash_shrink_memory(cnt); diff --git a/kernel/params.c b/kernel/params.c index 501bde4f3bee..81c4e78c8f4c 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -253,13 +253,13 @@ int parse_args(const char *doing, EXPORT_SYMBOL(param_ops_##name) -STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, strict_strtol); -STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, strict_strtol); -STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, strict_strtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, strict_strtol); -STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, strict_strtoul); +STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtoul); +STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, kstrtoul); +STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, kstrtoul); +STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) { -- cgit v1.3-8-gc7d7 From 0244ad004a54e39308d495fee0a2e637f8b5c317 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Fri, 30 Aug 2013 09:39:53 +0200 Subject: Remove GENERIC_HARDIRQ config option After the last architecture switched to generic hard irqs the config options HAVE_GENERIC_HARDIRQS & GENERIC_HARDIRQS and the related code for !CONFIG_GENERIC_HARDIRQS can be removed. Signed-off-by: Martin Schwidefsky --- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/avr32/Kconfig | 1 - arch/blackfin/Kconfig | 1 - arch/c6x/Kconfig | 1 - arch/cris/Kconfig | 1 - arch/frv/Kconfig | 1 - arch/h8300/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m32r/Kconfig | 1 - arch/m68k/Kconfig | 1 - arch/metag/Kconfig | 1 - arch/microblaze/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/mn10300/Kconfig | 1 - arch/openrisc/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/score/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/tile/Kconfig | 1 - arch/um/Kconfig.common | 1 - arch/um/defconfig | 2 - arch/unicore32/Kconfig | 1 - arch/x86/Kconfig | 1 - arch/xtensa/Kconfig | 1 - arch/xtensa/configs/common_defconfig | 1 - arch/xtensa/configs/iss_defconfig | 1 - arch/xtensa/configs/s6105_defconfig | 1 - drivers/block/mtip32xx/Kconfig | 2 +- drivers/char/random.c | 5 +- drivers/dma/dw/Kconfig | 1 - drivers/gpio/Kconfig | 6 +- drivers/hid/Kconfig | 2 +- drivers/i2c/Kconfig | 1 - drivers/i2c/busses/Kconfig | 6 +- drivers/iio/Kconfig | 1 - drivers/infiniband/hw/qib/Kconfig | 2 +- drivers/input/keyboard/Kconfig | 4 +- drivers/input/serio/Kconfig | 1 - drivers/input/touchscreen/Kconfig | 4 +- drivers/media/platform/Kconfig | 2 +- drivers/media/radio/Kconfig | 2 +- drivers/mfd/Kconfig | 126 ++++++++++++++++----------------- drivers/misc/cb710/Kconfig | 2 +- drivers/mmc/host/Kconfig | 2 +- drivers/net/ethernet/cadence/Kconfig | 2 +- drivers/net/wireless/p54/Kconfig | 2 +- drivers/net/wireless/ti/wl1251/Kconfig | 2 +- drivers/net/wireless/ti/wlcore/Kconfig | 2 +- drivers/pci/msi.c | 22 ------ drivers/power/Kconfig | 3 +- drivers/pps/clients/Kconfig | 2 +- drivers/spi/Kconfig | 3 +- drivers/tty/serial/Kconfig | 2 +- drivers/usb/dwc3/Kconfig | 2 +- drivers/usb/gadget/Kconfig | 4 +- drivers/usb/host/Kconfig | 1 - drivers/usb/musb/Kconfig | 1 - drivers/usb/renesas_usbhs/Kconfig | 2 +- drivers/w1/masters/Kconfig | 2 +- include/linux/cpu_rmap.h | 3 - include/linux/hardirq.h | 4 -- include/linux/interrupt.h | 75 +------------------- include/linux/irq.h | 9 --- include/linux/irqdesc.h | 3 - include/linux/irqnr.h | 19 ----- include/linux/kernel_stat.h | 34 --------- kernel/Makefile | 2 +- kernel/irq/Kconfig | 12 ---- kernel/softirq.c | 2 - lib/Kconfig.debug | 2 +- lib/cpu_rmap.c | 6 -- net/Kconfig | 2 +- sound/soc/codecs/Kconfig | 2 +- sound/soc/samsung/Kconfig | 6 +- 81 files changed, 100 insertions(+), 337 deletions(-) (limited to 'kernel') diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 082d9b4b5472..35a300d4a9fb 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -7,7 +7,6 @@ config ALPHA select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS select HAVE_DMA_ATTRS - select HAVE_GENERIC_HARDIRQS select VIRT_TO_BUS select GENERIC_IRQ_PROBE select AUTO_IRQ_AFFINITY if SMP diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 68fcbb2d59e2..91dbb2757afd 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -20,7 +20,6 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK - select HAVE_GENERIC_HARDIRQS select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_KRETPROBES diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index d13f6743df4b..3f7714d8d2d2 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -37,7 +37,6 @@ config ARM select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL) select HAVE_FUNCTION_TRACER if (!XIP_KERNEL) select HAVE_GENERIC_DMA_COHERENT - select HAVE_GENERIC_HARDIRQS select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7)) select HAVE_IDE if PCI || ISA || PCMCIA select HAVE_IRQ_TIME_ACCOUNTING diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ae323a45c28c..c04454876bcb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -23,7 +23,6 @@ config ARM64 select HAVE_DMA_API_DEBUG select HAVE_DMA_ATTRS select HAVE_GENERIC_DMA_COHERENT - select HAVE_GENERIC_HARDIRQS select HAVE_HW_BREAKPOINT if PERF_EVENTS select HAVE_MEMBLOCK select HAVE_PERF_EVENTS diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig index 549903cfc2cb..b6878eb64884 100644 --- a/arch/avr32/Kconfig +++ b/arch/avr32/Kconfig @@ -6,7 +6,6 @@ config AVR32 select HAVE_CLK select HAVE_OPROFILE select HAVE_KPROBES - select HAVE_GENERIC_HARDIRQS select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_ATOMIC64 diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index 3b6abc54b015..f78c9a2c7e28 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -32,7 +32,6 @@ config BLACKFIN select HAVE_UNDERSCORE_SYMBOL_PREFIX select VIRT_TO_BUS select ARCH_WANT_IPC_PARSE_VERSION - select HAVE_GENERIC_HARDIRQS select GENERIC_ATOMIC64 select GENERIC_IRQ_PROBE select USE_GENERIC_SMP_HELPERS if SMP diff --git a/arch/c6x/Kconfig b/arch/c6x/Kconfig index f6a3648f5ec3..957dd00ea561 100644 --- a/arch/c6x/Kconfig +++ b/arch/c6x/Kconfig @@ -10,7 +10,6 @@ config C6X select GENERIC_IRQ_SHOW select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG - select HAVE_GENERIC_HARDIRQS select HAVE_MEMBLOCK select SPARSE_IRQ select IRQ_DOMAIN diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig index c699d3259872..02380bed189c 100644 --- a/arch/cris/Kconfig +++ b/arch/cris/Kconfig @@ -41,7 +41,6 @@ config CRIS default y select HAVE_IDE select GENERIC_ATOMIC64 - select HAVE_GENERIC_HARDIRQS select HAVE_UID16 select VIRT_TO_BUS select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index 4b6628ea381e..34aa19352dc1 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -5,7 +5,6 @@ config FRV select HAVE_ARCH_TRACEHOOK select HAVE_PERF_EVENTS select HAVE_UID16 - select HAVE_GENERIC_HARDIRQS select VIRT_TO_BUS select GENERIC_IRQ_SHOW select HAVE_DEBUG_BUGVERBOSE diff --git a/arch/h8300/Kconfig b/arch/h8300/Kconfig index 3d6759ee382f..24b1dc2564f1 100644 --- a/arch/h8300/Kconfig +++ b/arch/h8300/Kconfig @@ -2,7 +2,6 @@ config H8300 bool default y select HAVE_IDE - select HAVE_GENERIC_HARDIRQS select GENERIC_ATOMIC64 select HAVE_UID16 select VIRT_TO_BUS diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 77d442ab28c8..99041b07e610 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -15,7 +15,6 @@ config HEXAGON # select GENERIC_PENDING_IRQ if SMP select GENERIC_ATOMIC64 select HAVE_PERF_EVENTS - select HAVE_GENERIC_HARDIRQS # GENERIC_ALLOCATOR is used by dma_alloc_coherent() select GENERIC_ALLOCATOR select GENERIC_IRQ_SHOW diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index a86a56d9e73f..7740ab10a171 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -21,7 +21,6 @@ config IA64 select HAVE_KVM select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG - select HAVE_GENERIC_HARDIRQS select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_VIRT_CPU_ACCOUNTING diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 29a7ef4e448b..75661fbf4529 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -9,7 +9,6 @@ config M32R select HAVE_KERNEL_LZMA select ARCH_WANT_IPC_PARSE_VERSION select HAVE_DEBUG_BUGVERBOSE - select HAVE_GENERIC_HARDIRQS select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index c3cda41af801..311a300d48cc 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -4,7 +4,6 @@ config M68K select HAVE_IDE select HAVE_AOUT if MMU select HAVE_DEBUG_BUGVERBOSE - select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_ATOMIC64 select HAVE_UID16 diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index cfd831c29824..36368eb07e13 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -13,7 +13,6 @@ config METAG select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST - select HAVE_GENERIC_HARDIRQS select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZO diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 3f6659cbc969..b82f82b74319 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -18,7 +18,6 @@ config MICROBLAZE select ARCH_WANT_IPC_PARSE_VERSION select HAVE_DEBUG_KMEMLEAK select IRQ_DOMAIN - select HAVE_GENERIC_HARDIRQS select VIRT_TO_BUS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index a9668d4653c2..f75ab4a2f246 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -25,7 +25,6 @@ config MIPS select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG - select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_PCI_IOMAP diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 70e4f663ebd2..6aaa1607001a 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -1,7 +1,6 @@ config MN10300 def_bool y select HAVE_OPROFILE - select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION select HAVE_ARCH_TRACEHOOK diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index d60bf98fa5cf..9488209a5253 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -11,7 +11,6 @@ config OPENRISC select HAVE_MEMBLOCK select ARCH_REQUIRE_GPIOLIB select HAVE_ARCH_TRACEHOOK - select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_CHIP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index aa399a5259b6..ad2ce8dab996 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -14,7 +14,6 @@ config PARISC select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE - select HAVE_GENERIC_HARDIRQS select BROKEN_RODATA select GENERIC_IRQ_PROBE select GENERIC_PCI_IOMAP diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 6b7530f8183c..38f3b7e47ec5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -114,7 +114,6 @@ config PPC select HAVE_PERF_EVENTS select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_HW_BREAKPOINT if PERF_EVENTS && PPC_BOOK3S_64 - select HAVE_GENERIC_HARDIRQS select ARCH_WANT_IPC_PARSE_VERSION select SPARSE_IRQ select IRQ_DOMAIN diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 3ec272859e1e..dcc6ac2d8026 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -116,7 +116,6 @@ config S390 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_TRACE_MCOUNT_TEST - select HAVE_GENERIC_HARDIRQS select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZ4 diff --git a/arch/score/Kconfig b/arch/score/Kconfig index 5fc237581caf..a1be70db75fe 100644 --- a/arch/score/Kconfig +++ b/arch/score/Kconfig @@ -2,7 +2,6 @@ menu "Machine selection" config SCORE def_bool y - select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_IOMAP select GENERIC_ATOMIC64 diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 1018ed3a3ca5..224f4bc9925e 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -26,7 +26,6 @@ config SUPERH select ARCH_WANT_IPC_PARSE_VERSION select HAVE_SYSCALL_TRACEPOINTS select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_GENERIC_HARDIRQS select MAY_HAVE_SPARSE_IRQ select IRQ_FORCED_THREADING select RTC_LIB diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 1570ad2802b3..2137ad667438 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -26,7 +26,6 @@ config SPARC select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG select HAVE_ARCH_JUMP_LABEL - select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION select USE_GENERIC_SMP_HELPERS if SMP diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index 932fa14de5fe..8a7cc663b3f8 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -11,7 +11,6 @@ config TILE select USE_GENERIC_SMP_HELPERS select CC_OPTIMIZE_FOR_SIZE select HAVE_DEBUG_KMEMLEAK - select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index bceee6623b00..8ddea1f8006a 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -6,7 +6,6 @@ config DEFCONFIG_LIST config UML bool default y - select HAVE_GENERIC_HARDIRQS select HAVE_UID16 select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES diff --git a/arch/um/defconfig b/arch/um/defconfig index 08107a795062..2665e6b683f5 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -129,12 +129,10 @@ CONFIG_BSD_PROCESS_ACCT=y # CONFIG_FHANDLE is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set -CONFIG_HAVE_GENERIC_HARDIRQS=y # # IRQ subsystem # -CONFIG_GENERIC_HARDIRQS=y CONFIG_GENERIC_IRQ_SHOW=y # diff --git a/arch/unicore32/Kconfig b/arch/unicore32/Kconfig index 41bcc0013442..82cdd8906f3d 100644 --- a/arch/unicore32/Kconfig +++ b/arch/unicore32/Kconfig @@ -2,7 +2,6 @@ config UNICORE32 def_bool y select HAVE_MEMBLOCK select HAVE_GENERIC_DMA_COHERENT - select HAVE_GENERIC_HARDIRQS select HAVE_DMA_ATTRS select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 30c40f08a3d4..e241a1930c98 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -82,7 +82,6 @@ config X86 select HAVE_USER_RETURN_NOTIFIER select ARCH_BINFMT_ELF_RANDOMIZE_PIE select HAVE_ARCH_JUMP_LABEL - select HAVE_GENERIC_HARDIRQS select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE select SPARSE_IRQ select GENERIC_FIND_FIRST_BIT diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 7ea6451a3a33..8d24dcb7cdac 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -7,7 +7,6 @@ config XTENSA select HAVE_IDE select GENERIC_ATOMIC64 select GENERIC_CLOCKEVENTS - select HAVE_GENERIC_HARDIRQS select VIRT_TO_BUS select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES diff --git a/arch/xtensa/configs/common_defconfig b/arch/xtensa/configs/common_defconfig index a182a4e6d688..f6000fe05119 100644 --- a/arch/xtensa/configs/common_defconfig +++ b/arch/xtensa/configs/common_defconfig @@ -8,7 +8,6 @@ CONFIG_XTENSA=y # CONFIG_UID16 is not set CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_HAVE_DEC_LOCK=y -CONFIG_GENERIC_HARDIRQS=y # # Code maturity level options diff --git a/arch/xtensa/configs/iss_defconfig b/arch/xtensa/configs/iss_defconfig index 77c52f80187a..4f233204faf9 100644 --- a/arch/xtensa/configs/iss_defconfig +++ b/arch/xtensa/configs/iss_defconfig @@ -9,7 +9,6 @@ CONFIG_XTENSA=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_GENERIC_HWEIGHT=y -CONFIG_GENERIC_HARDIRQS=y # CONFIG_ARCH_HAS_ILOG2_U32 is not set # CONFIG_ARCH_HAS_ILOG2_U64 is not set CONFIG_NO_IOPORT=y diff --git a/arch/xtensa/configs/s6105_defconfig b/arch/xtensa/configs/s6105_defconfig index 4799c6a526b5..d929f77a0360 100644 --- a/arch/xtensa/configs/s6105_defconfig +++ b/arch/xtensa/configs/s6105_defconfig @@ -9,7 +9,6 @@ CONFIG_XTENSA=y CONFIG_RWSEM_XCHGADD_ALGORITHM=y CONFIG_GENERIC_FIND_NEXT_BIT=y CONFIG_GENERIC_HWEIGHT=y -CONFIG_GENERIC_HARDIRQS=y # CONFIG_ARCH_HAS_ILOG2_U32 is not set # CONFIG_ARCH_HAS_ILOG2_U64 is not set CONFIG_NO_IOPORT=y diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig index 1fca1f996b45..0ba837fc62a8 100644 --- a/drivers/block/mtip32xx/Kconfig +++ b/drivers/block/mtip32xx/Kconfig @@ -4,6 +4,6 @@ config BLK_DEV_PCIESSD_MTIP32XX tristate "Block Device Driver for Micron PCIe SSDs" - depends on PCI && GENERIC_HARDIRQS + depends on PCI help This enables the block driver for Micron PCIe SSDs. diff --git a/drivers/char/random.c b/drivers/char/random.c index 0d91fe52f3f5..7737b5bd26af 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -255,10 +255,7 @@ #include #include #include - -#ifdef CONFIG_GENERIC_HARDIRQS -# include -#endif +#include #include #include diff --git a/drivers/dma/dw/Kconfig b/drivers/dma/dw/Kconfig index dde13248b681..dcfe964cc8dc 100644 --- a/drivers/dma/dw/Kconfig +++ b/drivers/dma/dw/Kconfig @@ -4,7 +4,6 @@ config DW_DMAC_CORE tristate "Synopsys DesignWare AHB DMA support" - depends on GENERIC_HARDIRQS select DMA_ENGINE config DW_DMAC diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig index 5cb218151c9c..b6ed304863eb 100644 --- a/drivers/gpio/Kconfig +++ b/drivers/gpio/Kconfig @@ -322,7 +322,7 @@ config GPIO_ICH config GPIO_VX855 tristate "VIA VX855/VX875 GPIO" - depends on PCI && GENERIC_HARDIRQS + depends on PCI select MFD_CORE select MFD_VX855 help @@ -396,7 +396,7 @@ config GPIO_MAX732X config GPIO_MAX732X_IRQ bool "Interrupt controller support for MAX732x" - depends on GPIO_MAX732X=y && GENERIC_HARDIRQS + depends on GPIO_MAX732X=y help Say yes here to enable the max732x to be used as an interrupt controller. It requires the driver to be built in the kernel. @@ -661,7 +661,7 @@ config GPIO_TIMBERDALE config GPIO_RDC321X tristate "RDC R-321x GPIO support" - depends on PCI && GENERIC_HARDIRQS + depends on PCI select MFD_CORE select MFD_RDC321X help diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig index 3d7c9f67b6d7..71b70e3a7a71 100644 --- a/drivers/hid/Kconfig +++ b/drivers/hid/Kconfig @@ -773,7 +773,7 @@ config HID_ZYDACRON config HID_SENSOR_HUB tristate "HID Sensors framework support" - depends on HID && GENERIC_HARDIRQS + depends on HID select MFD_CORE default n ---help--- diff --git a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig index e380c6eef3af..7b7ea320a258 100644 --- a/drivers/i2c/Kconfig +++ b/drivers/i2c/Kconfig @@ -75,7 +75,6 @@ config I2C_HELPER_AUTO config I2C_SMBUS tristate "SMBus-specific protocols" if !I2C_HELPER_AUTO - depends on GENERIC_HARDIRQS help Say Y here if you want support for SMBus extensions to the I2C specification. At the moment, the only supported extension is diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index fcdd321f709e..cdcbd8368ed3 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -115,7 +115,7 @@ config I2C_I801 config I2C_ISCH tristate "Intel SCH SMBus 1.0" - depends on PCI && GENERIC_HARDIRQS + depends on PCI select LPC_SCH help Say Y here if you want to use SMBus controller on the Intel SCH @@ -546,7 +546,6 @@ config I2C_NUC900 config I2C_OCORES tristate "OpenCores I2C Controller" - depends on GENERIC_HARDIRQS help If you say yes to this option, support will be included for the OpenCores I2C controller. For details see @@ -791,7 +790,7 @@ config I2C_DIOLAN_U2C config I2C_PARPORT tristate "Parallel port adapter" - depends on PARPORT && GENERIC_HARDIRQS + depends on PARPORT select I2C_ALGOBIT select I2C_SMBUS help @@ -816,7 +815,6 @@ config I2C_PARPORT config I2C_PARPORT_LIGHT tristate "Parallel port adapter (light)" - depends on GENERIC_HARDIRQS select I2C_ALGOBIT select I2C_SMBUS help diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig index cbea3271c1b1..90cf0cda50c4 100644 --- a/drivers/iio/Kconfig +++ b/drivers/iio/Kconfig @@ -4,7 +4,6 @@ menuconfig IIO tristate "Industrial I/O support" - depends on GENERIC_HARDIRQS help The industrial I/O subsystem provides a unified framework for drivers for many different types of embedded sensors using a diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig index d03ca4c1ff25..495be09781b1 100644 --- a/drivers/infiniband/hw/qib/Kconfig +++ b/drivers/infiniband/hw/qib/Kconfig @@ -8,7 +8,7 @@ config INFINIBAND_QIB config INFINIBAND_QIB_DCA bool "QIB DCA support" - depends on INFINIBAND_QIB && DCA && SMP && GENERIC_HARDIRQS && !(INFINIBAND_QIB=y && DCA=m) + depends on INFINIBAND_QIB && DCA && SMP && !(INFINIBAND_QIB=y && DCA=m) default y ---help--- Setting this enables DCA support on some Intel chip sets diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index 269d4c3658cb..c1edd39bc5ba 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -224,7 +224,7 @@ config KEYBOARD_TCA6416 config KEYBOARD_TCA8418 tristate "TCA8418 Keypad Support" - depends on I2C && GENERIC_HARDIRQS + depends on I2C select INPUT_MATRIXKMAP help This driver implements basic keypad functionality @@ -303,7 +303,7 @@ config KEYBOARD_HP7XX config KEYBOARD_LM8323 tristate "LM8323 keypad chip" - depends on I2C && GENERIC_HARDIRQS + depends on I2C depends on LEDS_CLASS help If you say yes here you get support for the National Semiconductor diff --git a/drivers/input/serio/Kconfig b/drivers/input/serio/Kconfig index 1e691a3a79cb..33b3e88fe4a2 100644 --- a/drivers/input/serio/Kconfig +++ b/drivers/input/serio/Kconfig @@ -239,7 +239,6 @@ config SERIO_PS2MULT config SERIO_ARC_PS2 tristate "ARC PS/2 support" - depends on GENERIC_HARDIRQS help Say Y here if you have an ARC FPGA platform with a PS/2 controller in it. diff --git a/drivers/input/touchscreen/Kconfig b/drivers/input/touchscreen/Kconfig index 3b9758b5f4d7..e09ec67957a3 100644 --- a/drivers/input/touchscreen/Kconfig +++ b/drivers/input/touchscreen/Kconfig @@ -389,7 +389,7 @@ config TOUCHSCREEN_MCS5000 config TOUCHSCREEN_MMS114 tristate "MELFAS MMS114 touchscreen" - depends on I2C && GENERIC_HARDIRQS + depends on I2C help Say Y here if you have the MELFAS MMS114 touchscreen controller chip in your system. @@ -845,7 +845,7 @@ config TOUCHSCREEN_TSC_SERIO config TOUCHSCREEN_TSC2005 tristate "TSC2005 based touchscreens" - depends on SPI_MASTER && GENERIC_HARDIRQS + depends on SPI_MASTER help Say Y here if you have a TSC2005 based touchscreen. diff --git a/drivers/media/platform/Kconfig b/drivers/media/platform/Kconfig index 8068d7b64155..c7caf94621b4 100644 --- a/drivers/media/platform/Kconfig +++ b/drivers/media/platform/Kconfig @@ -203,7 +203,7 @@ config VIDEO_SAMSUNG_EXYNOS_GSC config VIDEO_SH_VEU tristate "SuperH VEU mem2mem video processing driver" - depends on VIDEO_DEV && VIDEO_V4L2 && GENERIC_HARDIRQS && HAS_DMA + depends on VIDEO_DEV && VIDEO_V4L2 && HAS_DMA select VIDEOBUF2_DMA_CONTIG select V4L2_MEM2MEM_DEV help diff --git a/drivers/media/radio/Kconfig b/drivers/media/radio/Kconfig index 39882ddd2594..6ecdc39bb366 100644 --- a/drivers/media/radio/Kconfig +++ b/drivers/media/radio/Kconfig @@ -214,7 +214,7 @@ config RADIO_TIMBERDALE config RADIO_WL1273 tristate "Texas Instruments WL1273 I2C FM Radio" - depends on I2C && VIDEO_V4L2 && GENERIC_HARDIRQS + depends on I2C && VIDEO_V4L2 select MFD_CORE select MFD_WL1273_CORE select FW_LOADER diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig index e0e46f50f95d..914c3d142f78 100644 --- a/drivers/mfd/Kconfig +++ b/drivers/mfd/Kconfig @@ -23,7 +23,7 @@ config MFD_AS3711 select MFD_CORE select REGMAP_I2C select REGMAP_IRQ - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y help Support for the AS3711 PMIC from AMS @@ -40,7 +40,7 @@ config PMIC_ADP5520 config MFD_AAT2870_CORE bool "AnalogicTech AAT2870" select MFD_CORE - depends on I2C=y && GPIOLIB && GENERIC_HARDIRQS + depends on I2C=y && GPIOLIB help If you say yes here you get support for the AAT2870. This driver provides common support for accessing the device, @@ -78,7 +78,7 @@ config MFD_CROS_EC_SPI config MFD_ASIC3 bool "Compaq ASIC3" - depends on GENERIC_HARDIRQS && GPIOLIB && ARM + depends on GPIOLIB && ARM select MFD_CORE ---help--- This driver supports the ASIC3 multifunction chip found on many @@ -104,7 +104,7 @@ config MFD_DA9052_SPI select REGMAP_SPI select REGMAP_IRQ select PMIC_DA9052 - depends on SPI_MASTER=y && GENERIC_HARDIRQS + depends on SPI_MASTER=y help Support for the Dialog Semiconductor DA9052 PMIC when controlled using SPI. This driver provides common support @@ -116,7 +116,7 @@ config MFD_DA9052_I2C select REGMAP_I2C select REGMAP_IRQ select PMIC_DA9052 - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y help Support for the Dialog Semiconductor DA9052 PMIC when controlled using I2C. This driver provides common support @@ -128,7 +128,7 @@ config MFD_DA9055 select REGMAP_I2C select REGMAP_IRQ select MFD_CORE - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y help Say yes here for support of Dialog Semiconductor DA9055. This is a Power Management IC. This driver provides common support for @@ -144,7 +144,7 @@ config MFD_DA9063 select MFD_CORE select REGMAP_I2C select REGMAP_IRQ - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y help Say yes here for support for the Dialog Semiconductor DA9063 PMIC. This includes the I2C driver and core APIs. @@ -156,7 +156,7 @@ config MFD_MC13783 config MFD_MC13XXX tristate - depends on (SPI_MASTER || I2C) && GENERIC_HARDIRQS + depends on (SPI_MASTER || I2C) select MFD_CORE select MFD_MC13783 help @@ -167,7 +167,7 @@ config MFD_MC13XXX config MFD_MC13XXX_SPI tristate "Freescale MC13783 and MC13892 SPI interface" - depends on SPI_MASTER && GENERIC_HARDIRQS + depends on SPI_MASTER select REGMAP_SPI select MFD_MC13XXX help @@ -175,7 +175,7 @@ config MFD_MC13XXX_SPI config MFD_MC13XXX_I2C tristate "Freescale MC13892 I2C interface" - depends on I2C && GENERIC_HARDIRQS + depends on I2C select REGMAP_I2C select MFD_MC13XXX help @@ -183,7 +183,7 @@ config MFD_MC13XXX_I2C config HTC_EGPIO bool "HTC EGPIO support" - depends on GENERIC_HARDIRQS && GPIOLIB && ARM + depends on GPIOLIB && ARM help This driver supports the CPLD egpio chip present on several HTC phones. It provides basic support for input @@ -192,7 +192,6 @@ config HTC_EGPIO config HTC_PASIC3 tristate "HTC PASIC3 LED/DS1WM chip support" select MFD_CORE - depends on GENERIC_HARDIRQS help This core driver provides register access for the LED/DS1WM chips labeled "AIC2" and "AIC3", found on HTC Blueangel and @@ -210,7 +209,7 @@ config HTC_I2CPLD config LPC_ICH tristate "Intel ICH LPC" - depends on PCI && GENERIC_HARDIRQS + depends on PCI select MFD_CORE help The LPC bridge function of the Intel ICH provides support for @@ -220,7 +219,7 @@ config LPC_ICH config LPC_SCH tristate "Intel SCH LPC" - depends on PCI && GENERIC_HARDIRQS + depends on PCI select MFD_CORE help LPC bridge function of the Intel SCH provides support for @@ -238,7 +237,7 @@ config MFD_INTEL_MSIC config MFD_JANZ_CMODIO tristate "Janz CMOD-IO PCI MODULbus Carrier Board" select MFD_CORE - depends on PCI && GENERIC_HARDIRQS + depends on PCI help This is the core driver for the Janz CMOD-IO PCI MODULbus carrier board. This device is a PCI to MODULbus bridge which may @@ -277,7 +276,7 @@ config MFD_KEMPLD config MFD_88PM800 tristate "Marvell 88PM800" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select REGMAP_I2C select REGMAP_IRQ select MFD_CORE @@ -289,7 +288,7 @@ config MFD_88PM800 config MFD_88PM805 tristate "Marvell 88PM805" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select REGMAP_I2C select REGMAP_IRQ select MFD_CORE @@ -301,7 +300,7 @@ config MFD_88PM805 config MFD_88PM860X bool "Marvell 88PM8606/88PM8607" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select REGMAP_I2C select MFD_CORE help @@ -312,7 +311,7 @@ config MFD_88PM860X config MFD_MAX77686 bool "Maxim Semiconductor MAX77686 PMIC Support" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C select IRQ_DOMAIN @@ -325,7 +324,7 @@ config MFD_MAX77686 config MFD_MAX77693 bool "Maxim Semiconductor MAX77693 PMIC Support" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C help @@ -339,7 +338,7 @@ config MFD_MAX77693 config MFD_MAX8907 tristate "Maxim Semiconductor MAX8907 PMIC Support" select MFD_CORE - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select REGMAP_I2C select REGMAP_IRQ help @@ -350,7 +349,7 @@ config MFD_MAX8907 config MFD_MAX8925 bool "Maxim Semiconductor MAX8925 PMIC Support" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE help Say yes here to support for Maxim Semiconductor MAX8925. This is @@ -360,7 +359,7 @@ config MFD_MAX8925 config MFD_MAX8997 bool "Maxim Semiconductor MAX8997/8966 PMIC Support" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select IRQ_DOMAIN help @@ -373,7 +372,7 @@ config MFD_MAX8997 config MFD_MAX8998 bool "Maxim Semiconductor MAX8998/National LP3974 PMIC Support" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select IRQ_DOMAIN help @@ -385,7 +384,7 @@ config MFD_MAX8998 config EZX_PCAP bool "Motorola EZXPCAP Support" - depends on GENERIC_HARDIRQS && SPI_MASTER + depends on SPI_MASTER help This enables the PCAP ASIC present on EZX Phones. This is needed for MMC, TouchScreen, Sound, USB, etc.. @@ -393,7 +392,7 @@ config EZX_PCAP config MFD_VIPERBOARD tristate "Nano River Technologies Viperboard" select MFD_CORE - depends on USB && GENERIC_HARDIRQS + depends on USB default n help Say yes here if you want support for Nano River Technologies @@ -407,7 +406,7 @@ config MFD_VIPERBOARD config MFD_RETU tristate "Nokia Retu and Tahvo multi-function device" select MFD_CORE - depends on I2C && GENERIC_HARDIRQS + depends on I2C select REGMAP_IRQ help Retu and Tahvo are a multi-function devices found on Nokia @@ -480,7 +479,7 @@ config MFD_PM8XXX_IRQ config MFD_RDC321X tristate "RDC R-321x southbridge" select MFD_CORE - depends on PCI && GENERIC_HARDIRQS + depends on PCI help Say yes here if you want to have support for the RDC R-321x SoC southbridge which provides access to GPIOs and Watchdog using the @@ -488,7 +487,7 @@ config MFD_RDC321X config MFD_RTSX_PCI tristate "Realtek PCI-E card reader" - depends on PCI && GENERIC_HARDIRQS + depends on PCI select MFD_CORE help This supports for Realtek PCI-Express card reader including rts5209, @@ -498,7 +497,7 @@ config MFD_RTSX_PCI config MFD_RC5T583 bool "Ricoh RC5T583 Power Management system device" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C help @@ -512,7 +511,7 @@ config MFD_RC5T583 config MFD_SEC_CORE bool "SAMSUNG Electronics PMIC Series Support" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C select REGMAP_IRQ @@ -555,7 +554,7 @@ config MFD_SM501_GPIO config MFD_SMSC bool "SMSC ECE1099 series chips" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C help @@ -577,7 +576,7 @@ config ABX500_CORE config AB3100_CORE bool "ST-Ericsson AB3100 Mixed Signal Circuit core functions" - depends on I2C=y && ABX500_CORE && GENERIC_HARDIRQS + depends on I2C=y && ABX500_CORE select MFD_CORE default y if ARCH_U300 help @@ -601,7 +600,7 @@ config AB3100_OTP config AB8500_CORE bool "ST-Ericsson AB8500 Mixed Signal Power Management chip" - depends on GENERIC_HARDIRQS && ABX500_CORE && MFD_DB8500_PRCMU + depends on ABX500_CORE && MFD_DB8500_PRCMU select POWER_SUPPLY select MFD_CORE select IRQ_DOMAIN @@ -639,7 +638,7 @@ config MFD_DB8500_PRCMU config MFD_STMPE bool "STMicroelectronics STMPE" - depends on (I2C=y || SPI_MASTER=y) && GENERIC_HARDIRQS + depends on (I2C=y || SPI_MASTER=y) select MFD_CORE help Support for the STMPE family of I/O Expanders from @@ -680,7 +679,7 @@ endmenu config MFD_STA2X11 bool "STMicroelectronics STA2X11" - depends on STA2X11 && GENERIC_HARDIRQS + depends on STA2X11 select MFD_CORE select REGMAP_MMIO @@ -700,7 +699,6 @@ config MFD_TI_AM335X_TSCADC select MFD_CORE select REGMAP select REGMAP_MMIO - depends on GENERIC_HARDIRQS help If you say yes here you get support for Texas Instruments series of Touch Screen /ADC chips. @@ -717,7 +715,7 @@ config MFD_DM355EVM_MSP config MFD_LP8788 bool "TI LP8788 Power Management Unit Driver" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C select IRQ_DOMAIN @@ -739,14 +737,14 @@ config MFD_PALMAS select MFD_CORE select REGMAP_I2C select REGMAP_IRQ - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y help If you say yes here you get support for the Palmas series of PMIC chips from Texas Instruments. config MFD_TI_SSP tristate "TI Sequencer Serial Port support" - depends on ARCH_DAVINCI_TNETV107X && GENERIC_HARDIRQS + depends on ARCH_DAVINCI_TNETV107X select MFD_CORE ---help--- Say Y here if you want support for the Sequencer Serial Port @@ -761,7 +759,6 @@ config TPS6105X select REGULATOR select MFD_CORE select REGULATOR_FIXED_VOLTAGE - depends on GENERIC_HARDIRQS help This option enables a driver for the TP61050/TPS61052 high-power "white LED driver". This boost converter is @@ -784,7 +781,7 @@ config TPS65010 config TPS6507X tristate "TI TPS6507x Power Management / Touch Screen chips" select MFD_CORE - depends on I2C && GENERIC_HARDIRQS + depends on I2C help If you say yes here you get support for the TPS6507x series of Power Management / Touch Screen chips. These include voltage @@ -798,7 +795,7 @@ config TPS65911_COMPARATOR config MFD_TPS65090 bool "TI TPS65090 Power Management chips" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C select REGMAP_IRQ @@ -811,7 +808,7 @@ config MFD_TPS65090 config MFD_TPS65217 tristate "TI TPS65217 Power Management / White LED chips" - depends on I2C && GENERIC_HARDIRQS + depends on I2C select MFD_CORE select REGMAP_I2C help @@ -826,7 +823,7 @@ config MFD_TPS65217 config MFD_TPS6586X bool "TI TPS6586x Power Management chips" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C help @@ -841,7 +838,7 @@ config MFD_TPS6586X config MFD_TPS65910 bool "TI TPS65910 Power Management chip" - depends on I2C=y && GPIOLIB && GENERIC_HARDIRQS + depends on I2C=y && GPIOLIB select MFD_CORE select REGMAP_I2C select REGMAP_IRQ @@ -862,7 +859,7 @@ config MFD_TPS65912_I2C bool "TI TPS65912 Power Management chip with I2C" select MFD_CORE select MFD_TPS65912 - depends on I2C=y && GPIOLIB && GENERIC_HARDIRQS + depends on I2C=y && GPIOLIB help If you say yes here you get support for the TPS65912 series of PM chips with I2C interface. @@ -871,14 +868,14 @@ config MFD_TPS65912_SPI bool "TI TPS65912 Power Management chip with SPI" select MFD_CORE select MFD_TPS65912 - depends on SPI_MASTER && GPIOLIB && GENERIC_HARDIRQS + depends on SPI_MASTER && GPIOLIB help If you say yes here you get support for the TPS65912 series of PM chips with SPI interface. config MFD_TPS80031 bool "TI TPS80031/TPS80032 Power Management chips" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C select REGMAP_IRQ @@ -892,7 +889,7 @@ config MFD_TPS80031 config TWL4030_CORE bool "TI TWL4030/TWL5030/TWL6030/TPS659x0 Support" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select IRQ_DOMAIN select REGMAP_I2C help @@ -931,13 +928,13 @@ config TWL4030_POWER config MFD_TWL4030_AUDIO bool "TI TWL4030 Audio" - depends on TWL4030_CORE && GENERIC_HARDIRQS + depends on TWL4030_CORE select MFD_CORE default n config TWL6040_CORE bool "TI TWL6040 audio codec" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE select REGMAP_I2C select REGMAP_IRQ @@ -961,7 +958,7 @@ config MENELAUS config MFD_WL1273_CORE tristate "TI WL1273 FM radio" - depends on I2C && GENERIC_HARDIRQS + depends on I2C select MFD_CORE default n help @@ -974,7 +971,6 @@ config MFD_LM3533 depends on I2C select MFD_CORE select REGMAP_I2C - depends on GENERIC_HARDIRQS help Say yes here to enable support for National Semiconductor / TI LM3533 Lighting Power chips. @@ -996,7 +992,7 @@ config MFD_TIMBERDALE config MFD_TC3589X bool "Toshiba TC35892 and variants" - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_CORE help Support for the Toshiba TC35892 and variants I/O Expander. @@ -1011,7 +1007,7 @@ config MFD_TMIO config MFD_T7L66XB bool "Toshiba T7L66XB" - depends on ARM && HAVE_CLK && GENERIC_HARDIRQS + depends on ARM && HAVE_CLK select MFD_CORE select MFD_TMIO help @@ -1036,7 +1032,7 @@ config MFD_TC6393XB config MFD_VX855 tristate "VIA VX855/VX875 integrated south bridge" - depends on PCI && GENERIC_HARDIRQS + depends on PCI select MFD_CORE help Say yes here to enable support for various functions of the @@ -1054,7 +1050,7 @@ config MFD_ARIZONA_I2C select MFD_ARIZONA select MFD_CORE select REGMAP_I2C - depends on I2C && GENERIC_HARDIRQS + depends on I2C help Support for the Wolfson Microelectronics Arizona platform audio SoC core functionality controlled via I2C. @@ -1064,7 +1060,7 @@ config MFD_ARIZONA_SPI select MFD_ARIZONA select MFD_CORE select REGMAP_SPI - depends on SPI_MASTER && GENERIC_HARDIRQS + depends on SPI_MASTER help Support for the Wolfson Microelectronics Arizona platform audio SoC core functionality controlled via I2C. @@ -1090,7 +1086,7 @@ config MFD_WM8997 config MFD_WM8400 bool "Wolfson Microelectronics WM8400" select MFD_CORE - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select REGMAP_I2C help Support for the Wolfson Microelecronics WM8400 PMIC and audio @@ -1100,7 +1096,6 @@ config MFD_WM8400 config MFD_WM831X bool - depends on GENERIC_HARDIRQS config MFD_WM831X_I2C bool "Wolfson Microelectronics WM831x/2x PMICs with I2C" @@ -1108,7 +1103,7 @@ config MFD_WM831X_I2C select MFD_WM831X select REGMAP_I2C select IRQ_DOMAIN - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y help Support for the Wolfson Microelecronics WM831x and WM832x PMICs when controlled using I2C. This driver provides common support @@ -1121,7 +1116,7 @@ config MFD_WM831X_SPI select MFD_WM831X select REGMAP_SPI select IRQ_DOMAIN - depends on SPI_MASTER && GENERIC_HARDIRQS + depends on SPI_MASTER help Support for the Wolfson Microelecronics WM831x and WM832x PMICs when controlled using SPI. This driver provides common support @@ -1130,12 +1125,11 @@ config MFD_WM831X_SPI config MFD_WM8350 bool - depends on GENERIC_HARDIRQS config MFD_WM8350_I2C bool "Wolfson Microelectronics WM8350 with I2C" select MFD_WM8350 - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y help The WM8350 is an integrated audio and power management subsystem with watchdog and RTC functionality for embedded @@ -1148,7 +1142,7 @@ config MFD_WM8994 select MFD_CORE select REGMAP_I2C select REGMAP_IRQ - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y help The WM8994 is a highly integrated hi-fi CODEC designed for smartphone applicatiosn. As well as audio functionality it diff --git a/drivers/misc/cb710/Kconfig b/drivers/misc/cb710/Kconfig index 5acb9c5b49c4..22429b8b1068 100644 --- a/drivers/misc/cb710/Kconfig +++ b/drivers/misc/cb710/Kconfig @@ -1,6 +1,6 @@ config CB710_CORE tristate "ENE CB710/720 Flash memory card reader support" - depends on PCI && GENERIC_HARDIRQS + depends on PCI help This option enables support for PCI ENE CB710/720 Flash memory card reader found in some laptops (ie. some versions of HP Compaq nx9500). diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index b7fd5ab80a48..7fc5099e44b2 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -487,7 +487,7 @@ config MMC_SDHI config MMC_CB710 tristate "ENE CB710 MMC/SD Interface support" - depends on PCI && GENERIC_HARDIRQS + depends on PCI select CB710_CORE help This option enables support for MMC/SD part of ENE CB710/720 Flash diff --git a/drivers/net/ethernet/cadence/Kconfig b/drivers/net/ethernet/cadence/Kconfig index 8030cc0396fd..751d5c7b312d 100644 --- a/drivers/net/ethernet/cadence/Kconfig +++ b/drivers/net/ethernet/cadence/Kconfig @@ -22,7 +22,7 @@ if NET_CADENCE config ARM_AT91_ETHER tristate "AT91RM9200 Ethernet support" - depends on GENERIC_HARDIRQS && HAS_DMA + depends on HAS_DMA select MACB ---help--- If you wish to compile a kernel for the AT91RM9200 and enable diff --git a/drivers/net/wireless/p54/Kconfig b/drivers/net/wireless/p54/Kconfig index 15ea36b51a66..cdafb8c73e82 100644 --- a/drivers/net/wireless/p54/Kconfig +++ b/drivers/net/wireless/p54/Kconfig @@ -41,7 +41,7 @@ config P54_PCI config P54_SPI tristate "Prism54 SPI (stlc45xx) support" - depends on P54_COMMON && SPI_MASTER && GENERIC_HARDIRQS + depends on P54_COMMON && SPI_MASTER ---help--- This driver is for stlc4550 or stlc4560 based wireless chips such as Nokia's N800/N810 Portable Internet Tablet. diff --git a/drivers/net/wireless/ti/wl1251/Kconfig b/drivers/net/wireless/ti/wl1251/Kconfig index 8fec4ed36ac2..477a206c098e 100644 --- a/drivers/net/wireless/ti/wl1251/Kconfig +++ b/drivers/net/wireless/ti/wl1251/Kconfig @@ -1,6 +1,6 @@ menuconfig WL1251 tristate "TI wl1251 driver support" - depends on MAC80211 && GENERIC_HARDIRQS + depends on MAC80211 select FW_LOADER select CRC7 ---help--- diff --git a/drivers/net/wireless/ti/wlcore/Kconfig b/drivers/net/wireless/ti/wlcore/Kconfig index 2b832825c3d4..7c099542b214 100644 --- a/drivers/net/wireless/ti/wlcore/Kconfig +++ b/drivers/net/wireless/ti/wlcore/Kconfig @@ -1,6 +1,6 @@ config WLCORE tristate "TI wlcore support" - depends on WL_TI && GENERIC_HARDIRQS && MAC80211 + depends on WL_TI && MAC80211 select FW_LOADER ---help--- This module contains the main code for TI WLAN chips. It abstracts diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c index b35f93c232cf..d5f90d6383bc 100644 --- a/drivers/pci/msi.c +++ b/drivers/pci/msi.c @@ -30,7 +30,6 @@ static int pci_msi_enable = 1; /* Arch hooks */ -#if defined(CONFIG_GENERIC_HARDIRQS) int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { struct msi_chip *chip = dev->bus->msi; @@ -67,21 +66,6 @@ int __weak arch_msi_check_device(struct pci_dev *dev, int nvec, int type) return chip->check_device(chip, dev, nvec, type); } -#else -int __weak arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - return -ENOSYS; -} - -void __weak arch_teardown_msi_irq(unsigned int irq) -{ -} - -int __weak arch_msi_check_device(struct pci_dev *dev, int nvec, int type) -{ - return 0; -} -#endif /* CONFIG_GENERIC_HARDIRQS */ int __weak arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { @@ -245,8 +229,6 @@ static void msix_mask_irq(struct msi_desc *desc, u32 flag) desc->masked = __msix_mask_irq(desc, flag); } -#ifdef CONFIG_GENERIC_HARDIRQS - static void msi_set_mask_bit(struct irq_data *data, u32 flag) { struct msi_desc *desc = irq_data_get_msi(data); @@ -270,8 +252,6 @@ void unmask_msi_irq(struct irq_data *data) msi_set_mask_bit(data, 0); } -#endif /* CONFIG_GENERIC_HARDIRQS */ - void __read_msi_msg(struct msi_desc *entry, struct msi_msg *msg) { BUG_ON(entry->dev->current_state != PCI_D0); @@ -382,10 +362,8 @@ static void free_msi_irqs(struct pci_dev *dev) nvec = entry->nvec_used; else nvec = 1 << entry->msi_attrib.multiple; -#ifdef CONFIG_GENERIC_HARDIRQS for (i = 0; i < nvec; i++) BUG_ON(irq_has_action(entry->irq + i)); -#endif } arch_teardown_msi_irqs(dev); diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index bb49ab684f9a..e6f92b450913 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -269,7 +269,6 @@ config CHARGER_ISP1704 config CHARGER_MAX8903 tristate "MAX8903 Battery DC-DC Charger for USB and Adapter Power" - depends on GENERIC_HARDIRQS help Say Y to enable support for the MAX8903 DC-DC charger and sysfs. The driver supports controlling charger-enable and current-limit @@ -370,7 +369,7 @@ config AB8500_BM config BATTERY_GOLDFISH tristate "Goldfish battery driver" - depends on GENERIC_HARDIRQS && (GOLDFISH || COMPILE_TEST) + depends on GOLDFISH || COMPILE_TEST help Say Y to enable support for the battery and AC power in the Goldfish emulator. diff --git a/drivers/pps/clients/Kconfig b/drivers/pps/clients/Kconfig index 6efd9b60d8ff..0c9f2805d076 100644 --- a/drivers/pps/clients/Kconfig +++ b/drivers/pps/clients/Kconfig @@ -31,7 +31,7 @@ config PPS_CLIENT_PARPORT config PPS_CLIENT_GPIO tristate "PPS client using GPIO" - depends on PPS && GENERIC_HARDIRQS + depends on PPS help If you say yes here you get support for a PPS source using GPIO. To be useful you must also register a platform device diff --git a/drivers/spi/Kconfig b/drivers/spi/Kconfig index 0170d4c4a8a3..b9c53cc40e1f 100644 --- a/drivers/spi/Kconfig +++ b/drivers/spi/Kconfig @@ -55,7 +55,6 @@ comment "SPI Master Controller Drivers" config SPI_ALTERA tristate "Altera SPI Controller" - depends on GENERIC_HARDIRQS select SPI_BITBANG help This is the driver for the Altera SPI Controller. @@ -358,7 +357,7 @@ config SPI_PXA2XX_DMA config SPI_PXA2XX tristate "PXA2xx SSP SPI master" - depends on (ARCH_PXA || PCI || ACPI) && GENERIC_HARDIRQS + depends on (ARCH_PXA || PCI || ACPI) select PXA_SSP if ARCH_PXA help This enables using a PXA2xx or Sodaville SSP port as a SPI master diff --git a/drivers/tty/serial/Kconfig b/drivers/tty/serial/Kconfig index 47c6e7b9e150..febd45cd5027 100644 --- a/drivers/tty/serial/Kconfig +++ b/drivers/tty/serial/Kconfig @@ -5,7 +5,7 @@ if TTY menu "Serial drivers" - depends on HAS_IOMEM && GENERIC_HARDIRQS + depends on HAS_IOMEM source "drivers/tty/serial/8250/Kconfig" diff --git a/drivers/usb/dwc3/Kconfig b/drivers/usb/dwc3/Kconfig index f969ea266acb..b870872e020f 100644 --- a/drivers/usb/dwc3/Kconfig +++ b/drivers/usb/dwc3/Kconfig @@ -1,6 +1,6 @@ config USB_DWC3 tristate "DesignWare USB3 DRD Core Support" - depends on (USB || USB_GADGET) && GENERIC_HARDIRQS && HAS_DMA + depends on (USB || USB_GADGET) && HAS_DMA depends on EXTCON select USB_XHCI_PLATFORM if USB_SUPPORT && USB_XHCI_HCD help diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index 30e2dd8a1f2c..48cddf3cd6b8 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -313,7 +313,7 @@ config USB_S3C_HSUDC config USB_MV_UDC tristate "Marvell USB2.0 Device Controller" - depends on GENERIC_HARDIRQS && HAS_DMA + depends on HAS_DMA help Marvell Socs (including PXA and MMP series) include a high speed USB2.0 OTG controller, which can be configured as high speed or @@ -425,7 +425,7 @@ config USB_GOKU config USB_EG20T tristate "Intel EG20T PCH/LAPIS Semiconductor IOH(ML7213/ML7831) UDC" - depends on PCI && GENERIC_HARDIRQS + depends on PCI help This is a USB device driver for EG20T PCH. EG20T PCH is the platform controller hub that is used in Intel's diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index 5be0326aae38..b3f20d7f15de 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -278,7 +278,6 @@ endif # USB_EHCI_HCD config USB_OXU210HP_HCD tristate "OXU210HP HCD support" - depends on GENERIC_HARDIRQS ---help--- The OXU210HP is an USB host/OTG/device controller. Enable this option if your board has this chip. If unsure, say N. diff --git a/drivers/usb/musb/Kconfig b/drivers/usb/musb/Kconfig index c64ee09a7c0e..c258a97ef1b0 100644 --- a/drivers/usb/musb/Kconfig +++ b/drivers/usb/musb/Kconfig @@ -71,7 +71,6 @@ config USB_MUSB_DA8XX config USB_MUSB_TUSB6010 tristate "TUSB6010" - depends on GENERIC_HARDIRQS config USB_MUSB_OMAP2PLUS tristate "OMAP2430 and onwards" diff --git a/drivers/usb/renesas_usbhs/Kconfig b/drivers/usb/renesas_usbhs/Kconfig index 019bf7e49ee6..1c4195abc108 100644 --- a/drivers/usb/renesas_usbhs/Kconfig +++ b/drivers/usb/renesas_usbhs/Kconfig @@ -4,7 +4,7 @@ config USB_RENESAS_USBHS tristate 'Renesas USBHS controller' - depends on USB_GADGET && GENERIC_HARDIRQS + depends on USB_GADGET default n help Renesas USBHS is a discrete USB host and peripheral controller chip diff --git a/drivers/w1/masters/Kconfig b/drivers/w1/masters/Kconfig index 2bd1257dcc1c..efc7f075fcbe 100644 --- a/drivers/w1/masters/Kconfig +++ b/drivers/w1/masters/Kconfig @@ -42,7 +42,7 @@ config W1_MASTER_MXC config W1_MASTER_DS1WM tristate "Maxim DS1WM 1-wire busmaster" - depends on W1 && GENERIC_HARDIRQS + depends on W1 help Say Y here to enable the DS1WM 1-wire driver, such as that in HP iPAQ devices like h5xxx, h2200, and ASIC3-based like diff --git a/include/linux/cpu_rmap.h b/include/linux/cpu_rmap.h index 1739510d8994..bdd18caa6c94 100644 --- a/include/linux/cpu_rmap.h +++ b/include/linux/cpu_rmap.h @@ -52,8 +52,6 @@ static inline void *cpu_rmap_lookup_obj(struct cpu_rmap *rmap, unsigned int cpu) return rmap->obj[rmap->near[cpu].index]; } -#ifdef CONFIG_GENERIC_HARDIRQS - /** * alloc_irq_cpu_rmap - allocate CPU affinity reverse-map for IRQs * @size: Number of objects to be mapped @@ -68,5 +66,4 @@ extern void free_irq_cpu_rmap(struct cpu_rmap *rmap); extern int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq); -#endif #endif /* __LINUX_CPU_RMAP_H */ diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index ccfe17c5c8da..1e041063b226 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -7,11 +7,7 @@ #include -#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS) extern void synchronize_irq(unsigned int irq); -#else -# define synchronize_irq(irq) barrier() -#endif #if defined(CONFIG_TINY_RCU) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 5fa5afeeb759..5e865b554940 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -120,7 +120,6 @@ struct irqaction { extern irqreturn_t no_action(int cpl, void *dev_id); -#ifdef CONFIG_GENERIC_HARDIRQS extern int __must_check request_threaded_irq(unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, @@ -140,40 +139,6 @@ request_any_context_irq(unsigned int irq, irq_handler_t handler, extern int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id); -#else - -extern int __must_check -request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, - const char *name, void *dev); - -/* - * Special function to avoid ifdeffery in kernel/irq/devres.c which - * gets magically built by GENERIC_HARDIRQS=n architectures (sparc, - * m68k). I really love these $@%#!* obvious Makefile references: - * ../../../kernel/irq/devres.o - */ -static inline int __must_check -request_threaded_irq(unsigned int irq, irq_handler_t handler, - irq_handler_t thread_fn, - unsigned long flags, const char *name, void *dev) -{ - return request_irq(irq, handler, flags, name, dev); -} - -static inline int __must_check -request_any_context_irq(unsigned int irq, irq_handler_t handler, - unsigned long flags, const char *name, void *dev_id) -{ - return request_irq(irq, handler, flags, name, dev_id); -} - -static inline int __must_check -request_percpu_irq(unsigned int irq, irq_handler_t handler, - const char *devname, void __percpu *percpu_dev_id) -{ - return request_irq(irq, handler, 0, devname, percpu_dev_id); -} -#endif extern void free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); @@ -221,7 +186,6 @@ extern void enable_irq(unsigned int irq); extern void enable_percpu_irq(unsigned int irq, unsigned int type); /* The following three functions are for the core kernel use only. */ -#ifdef CONFIG_GENERIC_HARDIRQS extern void suspend_device_irqs(void); extern void resume_device_irqs(void); #ifdef CONFIG_PM_SLEEP @@ -229,13 +193,8 @@ extern int check_wakeup_irqs(void); #else static inline int check_wakeup_irqs(void) { return 0; } #endif -#else -static inline void suspend_device_irqs(void) { }; -static inline void resume_device_irqs(void) { }; -static inline int check_wakeup_irqs(void) { return 0; } -#endif -#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS) +#if defined(CONFIG_SMP) extern cpumask_var_t irq_default_affinity; @@ -287,9 +246,8 @@ static inline int irq_set_affinity_hint(unsigned int irq, { return -EINVAL; } -#endif /* CONFIG_SMP && CONFIG_GENERIC_HARDIRQS */ +#endif /* CONFIG_SMP */ -#ifdef CONFIG_GENERIC_HARDIRQS /* * Special lockdep variants of irq disabling/enabling. * These should be used for locking constructs that @@ -354,33 +312,6 @@ static inline int disable_irq_wake(unsigned int irq) return irq_set_irq_wake(irq, 0); } -#else /* !CONFIG_GENERIC_HARDIRQS */ -/* - * NOTE: non-genirq architectures, if they want to support the lock - * validator need to define the methods below in their asm/irq.h - * files, under an #ifdef CONFIG_LOCKDEP section. - */ -#ifndef CONFIG_LOCKDEP -# define disable_irq_nosync_lockdep(irq) disable_irq_nosync(irq) -# define disable_irq_nosync_lockdep_irqsave(irq, flags) \ - disable_irq_nosync(irq) -# define disable_irq_lockdep(irq) disable_irq(irq) -# define enable_irq_lockdep(irq) enable_irq(irq) -# define enable_irq_lockdep_irqrestore(irq, flags) \ - enable_irq(irq) -# endif - -static inline int enable_irq_wake(unsigned int irq) -{ - return 0; -} - -static inline int disable_irq_wake(unsigned int irq) -{ - return 0; -} -#endif /* CONFIG_GENERIC_HARDIRQS */ - #ifdef CONFIG_IRQ_FORCED_THREADING extern bool force_irqthreads; @@ -655,7 +586,7 @@ void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer) * if more than one irq occurred. */ -#if defined(CONFIG_GENERIC_HARDIRQS) && !defined(CONFIG_GENERIC_IRQ_PROBE) +#if !defined(CONFIG_GENERIC_IRQ_PROBE) static inline unsigned long probe_irq_on(void) { return 0; diff --git a/include/linux/irq.h b/include/linux/irq.h index f04d3ba335cb..56bb0dc8b7d4 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -382,8 +382,6 @@ extern void irq_cpu_online(void); extern void irq_cpu_offline(void); extern int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *cpumask); -#ifdef CONFIG_GENERIC_HARDIRQS - #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_PENDING_IRQ) void irq_move_irq(struct irq_data *data); void irq_move_masked_irq(struct irq_data *data); @@ -802,11 +800,4 @@ static inline void irq_gc_lock(struct irq_chip_generic *gc) { } static inline void irq_gc_unlock(struct irq_chip_generic *gc) { } #endif -#else /* !CONFIG_GENERIC_HARDIRQS */ - -extern struct msi_desc *irq_get_msi_desc(unsigned int irq); -extern int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry); - -#endif /* CONFIG_GENERIC_HARDIRQS */ - #endif /* _LINUX_IRQ_H */ diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index 623325e2ff97..56fb646909dc 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -76,8 +76,6 @@ struct irq_desc { extern struct irq_desc irq_desc[NR_IRQS]; #endif -#ifdef CONFIG_GENERIC_HARDIRQS - static inline struct irq_data *irq_desc_get_irq_data(struct irq_desc *desc) { return &desc->irq_data; @@ -173,6 +171,5 @@ __irq_set_preflow_handler(unsigned int irq, irq_preflow_handler_t handler) desc->preflow_handler = handler; } #endif -#endif #endif diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 0a2dc46cdaf6..fdd5cc16c9c4 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -4,23 +4,6 @@ #include -#ifndef CONFIG_GENERIC_HARDIRQS -#include - -/* - * Wrappers for non-genirq architectures: - */ -#define nr_irqs NR_IRQS -#define irq_to_desc(irq) (&irq_desc[irq]) - -# define for_each_irq_desc(irq, desc) \ - for (irq = 0; irq < nr_irqs; irq++) - -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs - 1; irq >= 0; irq--) - -#else /* CONFIG_GENERIC_HARDIRQS */ - extern int nr_irqs; extern struct irq_desc *irq_to_desc(unsigned int irq); unsigned int irq_get_next_irq(unsigned int offset); @@ -50,8 +33,6 @@ unsigned int irq_get_next_irq(unsigned int offset); for (irq = irq_get_next_irq(0); irq < nr_irqs; \ irq = irq_get_next_irq(irq + 1)) -#endif /* CONFIG_GENERIC_HARDIRQS */ - #define for_each_irq_nr(irq) \ for (irq = 0; irq < nr_irqs; irq++) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index ed5f6ed6eb77..51c72be4a7c3 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -36,9 +36,6 @@ struct kernel_cpustat { }; struct kernel_stat { -#ifndef CONFIG_GENERIC_HARDIRQS - unsigned int irqs[NR_IRQS]; -#endif unsigned long irqs_sum; unsigned int softirqs[NR_SOFTIRQS]; }; @@ -54,22 +51,6 @@ DECLARE_PER_CPU(struct kernel_cpustat, kernel_cpustat); extern unsigned long long nr_context_switches(void); -#ifndef CONFIG_GENERIC_HARDIRQS - -struct irq_desc; - -static inline void kstat_incr_irqs_this_cpu(unsigned int irq, - struct irq_desc *desc) -{ - __this_cpu_inc(kstat.irqs[irq]); - __this_cpu_inc(kstat.irqs_sum); -} - -static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) -{ - return kstat_cpu(cpu).irqs[irq]; -} -#else #include extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); @@ -79,8 +60,6 @@ do { \ __this_cpu_inc(kstat.irqs_sum); \ } while (0) -#endif - static inline void kstat_incr_softirqs_this_cpu(unsigned int irq) { __this_cpu_inc(kstat.softirqs[irq]); @@ -94,20 +73,7 @@ static inline unsigned int kstat_softirqs_cpu(unsigned int irq, int cpu) /* * Number of interrupts per specific IRQ source, since bootup */ -#ifndef CONFIG_GENERIC_HARDIRQS -static inline unsigned int kstat_irqs(unsigned int irq) -{ - unsigned int sum = 0; - int cpu; - - for_each_possible_cpu(cpu) - sum += kstat_irqs_cpu(irq, cpu); - - return sum; -} -#else extern unsigned int kstat_irqs(unsigned int irq); -#endif /* * Number of interrupts per cpu, since bootup diff --git a/kernel/Makefile b/kernel/Makefile index 35ef1185e359..1ce47553fb02 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -26,6 +26,7 @@ obj-y += sched/ obj-y += power/ obj-y += printk/ obj-y += cpu/ +obj-y += irq/ obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o obj-$(CONFIG_FREEZER) += freezer.o @@ -79,7 +80,6 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o -obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_TREE_RCU) += rcutree.o diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index d1a758bc972a..4a1fef09f658 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -1,15 +1,4 @@ -# Select this to activate the generic irq options below -config HAVE_GENERIC_HARDIRQS - bool - -if HAVE_GENERIC_HARDIRQS menu "IRQ subsystem" -# -# Interrupt subsystem related configuration options -# -config GENERIC_HARDIRQS - def_bool y - # Options selectable by the architecture code # Make sparse irq Kconfig switch below available @@ -84,4 +73,3 @@ config SPARSE_IRQ If you don't know what to do here, say N. endmenu -endif diff --git a/kernel/softirq.c b/kernel/softirq.c index be3d3514c325..53cc09ceb0b8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -876,7 +876,6 @@ int __init __weak early_irq_init(void) return 0; } -#ifdef CONFIG_GENERIC_HARDIRQS int __init __weak arch_probe_nr_irqs(void) { return NR_IRQS_LEGACY; @@ -886,4 +885,3 @@ int __init __weak arch_early_irq_init(void) { return 0; } -#endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c9eef36739a9..06344d986eb9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -597,7 +597,7 @@ endmenu # "Memory Debugging" config DEBUG_SHIRQ bool "Debug shared IRQ handlers" - depends on DEBUG_KERNEL && GENERIC_HARDIRQS + depends on DEBUG_KERNEL help Enable this to generate a spurious interrupt as soon as a shared interrupt handler is registered, and just before one is deregistered. diff --git a/lib/cpu_rmap.c b/lib/cpu_rmap.c index 5fbed5caba6e..4f134d8907a7 100644 --- a/lib/cpu_rmap.c +++ b/lib/cpu_rmap.c @@ -8,9 +8,7 @@ */ #include -#ifdef CONFIG_GENERIC_HARDIRQS #include -#endif #include /* @@ -213,8 +211,6 @@ int cpu_rmap_update(struct cpu_rmap *rmap, u16 index, } EXPORT_SYMBOL(cpu_rmap_update); -#ifdef CONFIG_GENERIC_HARDIRQS - /* Glue between IRQ affinity notifiers and CPU rmaps */ struct irq_glue { @@ -309,5 +305,3 @@ int irq_cpu_rmap_add(struct cpu_rmap *rmap, int irq) return rc; } EXPORT_SYMBOL(irq_cpu_rmap_add); - -#endif /* CONFIG_GENERIC_HARDIRQS */ diff --git a/net/Kconfig b/net/Kconfig index ee0213667272..b50dacc072f0 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -228,7 +228,7 @@ config RPS config RFS_ACCEL boolean - depends on RPS && GENERIC_HARDIRQS + depends on RPS select CPU_RMAP default y diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 15106c045478..b33b45dfceec 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -107,7 +107,7 @@ config SND_SOC_ALL_CODECS select SND_SOC_WM8782 select SND_SOC_WM8804 if SND_SOC_I2C_AND_SPI select SND_SOC_WM8900 if I2C - select SND_SOC_WM8903 if I2C && GENERIC_HARDIRQS + select SND_SOC_WM8903 if I2C select SND_SOC_WM8904 if I2C select SND_SOC_WM8940 if I2C select SND_SOC_WM8955 if I2C diff --git a/sound/soc/samsung/Kconfig b/sound/soc/samsung/Kconfig index 9855dfc3e3ec..2eea1840315d 100644 --- a/sound/soc/samsung/Kconfig +++ b/sound/soc/samsung/Kconfig @@ -63,7 +63,7 @@ config SND_SOC_SAMSUNG_SMDK_WM8580 config SND_SOC_SAMSUNG_SMDK_WM8994 tristate "SoC I2S Audio support for WM8994 on SMDK" depends on SND_SOC_SAMSUNG - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_WM8994 select SND_SOC_WM8994 select SND_SAMSUNG_I2S @@ -151,7 +151,7 @@ config SND_SOC_SMARTQ config SND_SOC_GONI_AQUILA_WM8994 tristate "SoC I2S Audio support for AQUILA/GONI - WM8994" depends on SND_SOC_SAMSUNG && (MACH_GONI || MACH_AQUILA) - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select SND_SAMSUNG_I2S select MFD_WM8994 select SND_SOC_WM8994 @@ -177,7 +177,7 @@ config SND_SOC_SMDK_WM8580_PCM config SND_SOC_SMDK_WM8994_PCM tristate "SoC PCM Audio support for WM8994 on SMDK" depends on SND_SOC_SAMSUNG - depends on I2C=y && GENERIC_HARDIRQS + depends on I2C=y select MFD_WM8994 select SND_SOC_WM8994 select SND_SAMSUNG_PCM -- cgit v1.3-8-gc7d7 From 13b62e46d5407c7d619aea1dc9c3e0991b631b57 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Mon, 16 Sep 2013 11:30:36 +0300 Subject: sched: Fix comment for sched_info_depart sched_info_depart seems to be only called from sched_info_switch(), so only on involuntary task switch. Fix the comment to match. Signed-off-by: Michael S. Tsirkin Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: KOSAKI Motohiro Link: http://lkml.kernel.org/r/20130916083036.GA1113@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/stats.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 5aef494fc8b4..c7edee71bce8 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -104,8 +104,9 @@ static inline void sched_info_queued(struct task_struct *t) } /* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. + * Called when a process ceases being the active-running process involuntarily + * due, typically, to expiring its time slice (this may also be called when + * switching to the idle task). Now we can calculate how long we ran. * Also, if the process is still in the TASK_RUNNING state, call * sched_info_queued() to mark that it has now again started waiting on * the runqueue. -- cgit v1.3-8-gc7d7 From fa7315871046b9a4c48627905691dbde57e51033 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Sep 2013 10:16:42 +0200 Subject: perf: Fix capabilities bitfield compatibility in 'struct perf_event_mmap_page' Solve the problems around the broken definition of perf_event_mmap_page:: cap_usr_time and cap_usr_rdpmc fields which used to overlap, partially fixed by: 860f085b74e9 ("perf: Fix broken union in 'struct perf_event_mmap_page'") The problem with the fix (merged in v3.12-rc1 and not yet released officially), noticed by Vince Weaver is that the new behavior is not detectable by new user-space, and that due to the reuse of the field names it's easy to mis-compile a binary if old headers are used on a new kernel or new headers are used on an old kernel. To solve all that make this change explicit, detectable and self-contained, by iterating the ABI the following way: - Always clear bit 0, and rename it to usrpage->cap_bit0, to at least not confuse old user-space binaries. RDPMC will be marked as unavailable to old binaries but that's within the ABI, this is a capability bit. - Rename bit 1 to ->cap_bit0_is_deprecated and always set it to 1, so new libraries can reliably detect that bit 0 is deprecated and perma-zero without having to check the kernel version. - Use bits 2, 3, 4 for the newly defined, correct functionality: cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ cap_user_time : 1, /* The time_* fields are used */ cap_user_time_zero : 1, /* The time_zero field is used */ - Rename all the bitfield names in perf_event.h to be different from the old names, to make sure it's not possible to mis-compile it accidentally with old assumptions. The 'size' field can then be used in the future to add new fields and it will act as a natural ABI version indicator as well. Also adjust tools/perf/ userspace for the new definitions, noticed by Adrian Hunter. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Also-Fixed-by: Adrian Hunter Link: http://lkml.kernel.org/n/tip-zr03yxjrpXesOzzupszqglbv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++----- include/uapi/linux/perf_event.h | 14 +++++++++----- kernel/events/core.c | 21 +++++++++++++++++++++ tools/perf/arch/x86/util/tsc.c | 6 +++--- 4 files changed, 38 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8355c84b9729..a9c606bb4945 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1883,9 +1883,9 @@ static struct pmu pmu = { void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { - userpg->cap_usr_time = 0; - userpg->cap_usr_time_zero = 0; - userpg->cap_usr_rdpmc = x86_pmu.attr_rdpmc; + userpg->cap_user_time = 0; + userpg->cap_user_time_zero = 0; + userpg->cap_user_rdpmc = x86_pmu.attr_rdpmc; userpg->pmc_width = x86_pmu.cntval_bits; if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) @@ -1894,13 +1894,13 @@ void arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) return; - userpg->cap_usr_time = 1; + userpg->cap_user_time = 1; userpg->time_mult = this_cpu_read(cyc2ns); userpg->time_shift = CYC2NS_SCALE_FACTOR; userpg->time_offset = this_cpu_read(cyc2ns_offset) - now; if (sched_clock_stable && !check_tsc_disabled()) { - userpg->cap_usr_time_zero = 1; + userpg->cap_user_time_zero = 1; userpg->time_zero = this_cpu_read(cyc2ns_offset); } } diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 7f6d584c267b..009a655a5d35 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -380,10 +380,13 @@ struct perf_event_mmap_page { union { __u64 capabilities; struct { - __u64 cap_usr_time : 1, - cap_usr_rdpmc : 1, - cap_usr_time_zero : 1, - cap_____res : 61; + __u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */ + cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ + + cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ + cap_user_time : 1, /* The time_* fields are used */ + cap_user_time_zero : 1, /* The time_zero field is used */ + cap_____res : 59; }; }; @@ -442,12 +445,13 @@ struct perf_event_mmap_page { * ((rem * time_mult) >> time_shift); */ __u64 time_zero; + __u32 size; /* Header size up to __reserved[] fields. */ /* * Hole for extension of the self monitor capabilities */ - __u64 __reserved[119]; /* align to 1k */ + __u8 __reserved[118*8+4]; /* align to 1k. */ /* * Control data for the mmap() data buffer. diff --git a/kernel/events/core.c b/kernel/events/core.c index dd236b66ca3a..cb4238e85b38 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3660,6 +3660,26 @@ static void calc_timer_values(struct perf_event *event, *running = ctx_time - event->tstamp_running; } +static void perf_event_init_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct ring_buffer *rb; + + rcu_read_lock(); + rb = rcu_dereference(event->rb); + if (!rb) + goto unlock; + + userpg = rb->user_page; + + /* Allow new userspace to detect that bit 0 is deprecated */ + userpg->cap_bit0_is_deprecated = 1; + userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + +unlock: + rcu_read_unlock(); +} + void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now) { } @@ -4044,6 +4064,7 @@ again: ring_buffer_attach(event, rb); rcu_assign_pointer(event->rb, rb); + perf_event_init_userpage(event); perf_event_update_userpage(event); unlock: diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c index 9570c2b0f83c..b2519e49424f 100644 --- a/tools/perf/arch/x86/util/tsc.c +++ b/tools/perf/arch/x86/util/tsc.c @@ -32,7 +32,7 @@ u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc) int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, struct perf_tsc_conversion *tc) { - bool cap_usr_time_zero; + bool cap_user_time_zero; u32 seq; int i = 0; @@ -42,7 +42,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, tc->time_mult = pc->time_mult; tc->time_shift = pc->time_shift; tc->time_zero = pc->time_zero; - cap_usr_time_zero = pc->cap_usr_time_zero; + cap_user_time_zero = pc->cap_user_time_zero; rmb(); if (pc->lock == seq && !(seq & 1)) break; @@ -52,7 +52,7 @@ int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc, } } - if (!cap_usr_time_zero) + if (!cap_user_time_zero) return -EOPNOTSUPP; return 0; -- cgit v1.3-8-gc7d7 From b18855500fc40da050512d9df82d2f1471e59642 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 17:49:13 +0400 Subject: sched/balancing: Fix 'local->avg_load > sds->avg_load' case in calculate_imbalance() In busiest->group_imb case we can come to calculate_imbalance() with local->avg_load >= busiest->avg_load >= sds->avg_load. This can result in imbalance overflow, because it is calculated as follows env->imbalance = min( max_pull * busiest->group_power, (sds->avg_load - local->avg_load) * local->group_power) / SCHED_POWER_SCALE; As a result we can end up constantly bouncing tasks from one cpu to another if there are pinned tasks. Fix this by skipping the assignment and assuming imbalance=0 in case local->avg_load > sds->avg_load. [ The bug can be caught by running 2*N cpuhogs pinned to two logical cpus belonging to different cores on an HT-enabled machine with N logical cpus: just look at se.nr_migrations growth. ] Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/8f596cc6bc0e5e655119dc892c9bfcad26e971f4.1379252740.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 11cd13667359..0b99aae339cb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4896,7 +4896,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * max load less than avg load(as we skip the groups at or below * its cpu_power, while calculating max_load..) */ - if (busiest->avg_load < sds->avg_load) { + if (busiest->avg_load <= sds->avg_load || + local->avg_load >= sds->avg_load) { env->imbalance = 0; return fix_small_imbalance(env, sds); } -- cgit v1.3-8-gc7d7 From 3029ede39373c368f402a76896600d85a4f7121b Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sun, 15 Sep 2013 17:49:14 +0400 Subject: sched/balancing: Fix 'local->avg_load > busiest->avg_load' case in fix_small_imbalance() In busiest->group_imb case we can come to fix_small_imbalance() with local->avg_load > busiest->avg_load. This can result in wrong imbalance fix-up, because there is the following check there where all the members are unsigned: if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } As a result we can end up constantly bouncing tasks from one cpu to another if there are pinned tasks. Fix it by substituting the subtraction with an equivalent addition in the check. [ The bug can be caught by running 2*N cpuhogs pinned to two logical cpus belonging to different cores on an HT-enabled machine with N logical cpus: just look at se.nr_migrations growth. ] Signed-off-by: Vladimir Davydov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/ef167822e5c5b2d96cf5b0e3e4f4bdff3f0414a2.1379252740.git.vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0b99aae339cb..2aedaccebcc8 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4823,8 +4823,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) (busiest->load_per_task * SCHED_POWER_SCALE) / busiest->group_power; - if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >= - (scaled_busy_load_per_task * imbn)) { + if (busiest->avg_load + scaled_busy_load_per_task >= + local->avg_load + (scaled_busy_load_per_task * imbn)) { env->imbalance = busiest->load_per_task; return; } -- cgit v1.3-8-gc7d7 From 7e3115ef5149fc502e3a2e80719dba54a8e7409d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Sat, 14 Sep 2013 19:39:46 +0400 Subject: sched/balancing: Fix cfs_rq->task_h_load calculation Patch a003a2 (sched: Consider runnable load average in move_tasks()) sets all top-level cfs_rqs' h_load to rq->avg.load_avg_contrib, which is always 0. This mistype leads to all tasks having weight 0 when load balancing in a cpu-cgroup enabled setup. There obviously should be sum of weights of all runnable tasks there instead. Fix it. Signed-off-by: Vladimir Davydov Reviewed-by: Paul Turner Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1379173186-11944-1-git-send-email-vdavydov@parallels.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2aedaccebcc8..7c70201fbc61 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4242,7 +4242,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) } if (!se) { - cfs_rq->h_load = rq->avg.load_avg_contrib; + cfs_rq->h_load = cfs_rq->runnable_load_avg; cfs_rq->last_h_load_update = now; } -- cgit v1.3-8-gc7d7 From 359e6fab6600562073162348cd4c18c5958296d8 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 Sep 2013 15:27:29 -0700 Subject: watchdog: update watchdog attributes atomically proc_dowatchdog doesn't synchronize multiple callers which might lead to confusion when two parallel callers might confuse watchdog_enable_all_cpus resp watchdog_disable_all_cpus (eg watchdog gets enabled even if watchdog_thresh was set to 0 already). This patch adds a local mutex which synchronizes callers to the sysctl handler. Signed-off-by: Michal Hocko Cc: Frederic Weisbecker Acked-by: Don Zickus Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/watchdog.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 51c4f34d258e..ced7d0609931 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -520,13 +520,15 @@ int proc_dowatchdog(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int err, old_thresh, old_enabled; + static DEFINE_MUTEX(watchdog_proc_mutex); + mutex_lock(&watchdog_proc_mutex); old_thresh = ACCESS_ONCE(watchdog_thresh); old_enabled = ACCESS_ONCE(watchdog_user_enabled); err = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (err || !write) - return err; + goto out; set_sample_period(); /* @@ -544,7 +546,8 @@ int proc_dowatchdog(struct ctl_table *table, int write, watchdog_thresh = old_thresh; watchdog_user_enabled = old_enabled; } - +out: + mutex_unlock(&watchdog_proc_mutex); return err; } #endif /* CONFIG_SYSCTL */ -- cgit v1.3-8-gc7d7 From 9809b18fcf6b8d8ec4d3643677345907e6b50eca Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 24 Sep 2013 15:27:30 -0700 Subject: watchdog: update watchdog_thresh properly watchdog_tresh controls how often nmi perf event counter checks per-cpu hrtimer_interrupts counter and blows up if the counter hasn't changed since the last check. The counter is updated by per-cpu watchdog_hrtimer hrtimer which is scheduled with 2/5 watchdog_thresh period which guarantees that hrtimer is scheduled 2 times per the main period. Both hrtimer and perf event are started together when the watchdog is enabled. So far so good. But... But what happens when watchdog_thresh is updated from sysctl handler? proc_dowatchdog will set a new sampling period and hrtimer callback (watchdog_timer_fn) will use the new value in the next round. The problem, however, is that nobody tells the perf event that the sampling period has changed so it is ticking with the period configured when it has been set up. This might result in an ear ripping dissonance between perf and hrtimer parts if the watchdog_thresh is increased. And even worse it might lead to KABOOM if the watchdog is configured to panic on such a spurious lockup. This patch fixes the issue by updating both nmi perf even counter and hrtimers if the threshold value has changed. The nmi one is disabled and then reinitialized from scratch. This has an unpleasant side effect that the allocation of the new event might fail theoretically so the hard lockup detector would be disabled for such cpus. On the other hand such a memory allocation failure is very unlikely because the original event is deallocated right before. It would be much nicer if we just changed perf event period but there doesn't seem to be any API to do that right now. It is also unfortunate that perf_event_alloc uses GFP_KERNEL allocation unconditionally so we cannot use on_each_cpu() and do the same thing from the per-cpu context. The update from the current CPU should be safe because perf_event_disable removes the event atomically before it clears the per-cpu watchdog_ev so it cannot change anything under running handler feet. The hrtimer is simply restarted (thanks to Don Zickus who has pointed this out) if it is queued because we cannot rely it will fire&adopt to the new sampling period before a new nmi event triggers (when the treshold is decreased). [akpm@linux-foundation.org: the UP version of __smp_call_function_single ended up in the wrong place] Signed-off-by: Michal Hocko Acked-by: Don Zickus Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Fabio Estevam Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 6 ++++++ kernel/watchdog.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 56 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index cfb7ca094b38..731f5237d5f4 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -155,6 +155,12 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } +static inline void __smp_call_function_single(int cpuid, + struct call_single_data *data, int wait) +{ + on_each_cpu(data->func, data->info, wait); +} + #endif /* !SMP */ /* diff --git a/kernel/watchdog.c b/kernel/watchdog.c index ced7d0609931..4431610f049a 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -486,7 +486,52 @@ static struct smp_hotplug_thread watchdog_threads = { .unpark = watchdog_enable, }; -static int watchdog_enable_all_cpus(void) +static void restart_watchdog_hrtimer(void *info) +{ + struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); + int ret; + + /* + * No need to cancel and restart hrtimer if it is currently executing + * because it will reprogram itself with the new period now. + * We should never see it unqueued here because we are running per-cpu + * with interrupts disabled. + */ + ret = hrtimer_try_to_cancel(hrtimer); + if (ret == 1) + hrtimer_start(hrtimer, ns_to_ktime(sample_period), + HRTIMER_MODE_REL_PINNED); +} + +static void update_timers(int cpu) +{ + struct call_single_data data = {.func = restart_watchdog_hrtimer}; + /* + * Make sure that perf event counter will adopt to a new + * sampling period. Updating the sampling period directly would + * be much nicer but we do not have an API for that now so + * let's use a big hammer. + * Hrtimer will adopt the new period on the next tick but this + * might be late already so we have to restart the timer as well. + */ + watchdog_nmi_disable(cpu); + __smp_call_function_single(cpu, &data, 1); + watchdog_nmi_enable(cpu); +} + +static void update_timers_all_cpus(void) +{ + int cpu; + + get_online_cpus(); + preempt_disable(); + for_each_online_cpu(cpu) + update_timers(cpu); + preempt_enable(); + put_online_cpus(); +} + +static int watchdog_enable_all_cpus(bool sample_period_changed) { int err = 0; @@ -496,6 +541,8 @@ static int watchdog_enable_all_cpus(void) pr_err("Failed to create watchdog threads, disabled\n"); else watchdog_running = 1; + } else if (sample_period_changed) { + update_timers_all_cpus(); } return err; @@ -537,7 +584,7 @@ int proc_dowatchdog(struct ctl_table *table, int write, * watchdog_*_all_cpus() function takes care of this. */ if (watchdog_user_enabled && watchdog_thresh) - err = watchdog_enable_all_cpus(); + err = watchdog_enable_all_cpus(old_thresh != watchdog_thresh); else watchdog_disable_all_cpus(); @@ -557,5 +604,5 @@ void __init lockup_detector_init(void) set_sample_period(); if (watchdog_user_enabled) - watchdog_enable_all_cpus(); + watchdog_enable_all_cpus(false); } -- cgit v1.3-8-gc7d7 From 8ac1c8d5deba65513b6a82c35e89e73996c8e0d6 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 24 Sep 2013 15:27:42 -0700 Subject: audit: fix endless wait in audit_log_start() After commit 829199197a43 ("kernel/audit.c: avoid negative sleep durations") audit emitters will block forever if userspace daemon cannot handle backlog. After the timeout the waiting loop turns into busy loop and runs until daemon dies or returns back to work. This is a minimal patch for that bug. Signed-off-by: Konstantin Khlebnikov Cc: Luiz Capitulino Cc: Richard Guy Briggs Cc: Eric Paris Cc: Chuck Anderson Cc: Dan Duval Cc: Dave Kleikamp Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/audit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 91e53d04b6a9..7b0e23a740ce 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -1117,9 +1117,10 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, sleep_time = timeout_start + audit_backlog_wait_time - jiffies; - if ((long)sleep_time > 0) + if ((long)sleep_time > 0) { wait_for_auditd(sleep_time); - continue; + continue; + } } if (audit_rate_check() && printk_ratelimit()) printk(KERN_WARNING -- cgit v1.3-8-gc7d7 From e2f0b88e84eed9cf9797f0a88c8012ee0b885a6d Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Tue, 24 Sep 2013 15:27:43 -0700 Subject: kernel/reboot.c: re-enable the function of variable reboot_default Commit 1b3a5d02ee07 ("reboot: move arch/x86 reboot= handling to generic kernel") did some cleanup for reboot= command line, but it made the reboot_default inoperative. The default value of variable reboot_default should be 1, and if command line reboot= is not set, system will use the default reboot mode. [akpm@linux-foundation.org: fix comment layout] Signed-off-by: Li Fei Signed-off-by: liu chuansheng Acked-by: Robin Holt Cc: [3.11.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/reboot.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/reboot.c b/kernel/reboot.c index 269ed9384cc4..f813b3474646 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -32,7 +32,14 @@ EXPORT_SYMBOL(cad_pid); #endif enum reboot_mode reboot_mode DEFAULT_REBOOT_MODE; -int reboot_default; +/* + * This variable is used privately to keep track of whether or not + * reboot_type is still set to its default value (i.e., reboot= hasn't + * been set on the command line). This is needed so that we can + * suppress DMI scanning for reboot quirks. Without it, it's + * impossible to override a faulty reboot quirk without recompiling. + */ +int reboot_default = 1; int reboot_cpu; enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; -- cgit v1.3-8-gc7d7 From 0c06a5d4b13cd66c833805a0d1db76b977944aac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 10 Sep 2013 00:54:17 +0200 Subject: arm: Fix build error with context tracking calls ad65782fba50 (context_tracking: Optimize main APIs off case with static key) converted context tracking main APIs to inline function and left ARM asm callers behind. This can be easily fixed by making ARM calling the post static keys context tracking function. We just need to replicate the static key checks there. We'll remove these later when ARM will support the context tracking static keys. Reported-by: Guenter Roeck Reported-by: Russell King Signed-off-by: Frederic Weisbecker Tested-by: Kevin Hilman Cc: Nicolas Pitre Cc: Anil Kumar Cc: Tony Lindgren Cc: Benoit Cousson Cc: Guenter Roeck Cc: Russell King Cc: Kevin Hilman --- arch/arm/kernel/entry-header.S | 8 ++++---- kernel/context_tracking.c | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index de23a9beed13..39f89fbd5111 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -329,10 +329,10 @@ #ifdef CONFIG_CONTEXT_TRACKING .if \save stmdb sp!, {r0-r3, ip, lr} - bl user_exit + bl context_tracking_user_exit ldmia sp!, {r0-r3, ip, lr} .else - bl user_exit + bl context_tracking_user_exit .endif #endif .endm @@ -341,10 +341,10 @@ #ifdef CONFIG_CONTEXT_TRACKING .if \save stmdb sp!, {r0-r3, ip, lr} - bl user_enter + bl context_tracking_user_enter ldmia sp!, {r0-r3, ip, lr} .else - bl user_enter + bl context_tracking_user_enter .endif #endif .endm diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 247091bf0587..859c8dfd78a1 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -50,6 +50,15 @@ void context_tracking_user_enter(void) { unsigned long flags; + /* + * Repeat the user_enter() check here because some archs may be calling + * this from asm and if no CPU needs context tracking, they shouldn't + * go further. Repeat the check here until they support the static key + * check. + */ + if (!static_key_false(&context_tracking_enabled)) + return; + /* * Some contexts may involve an exception occuring in an irq, * leading to that nesting: @@ -151,6 +160,9 @@ void context_tracking_user_exit(void) { unsigned long flags; + if (!static_key_false(&context_tracking_enabled)) + return; + if (in_interrupt()) return; -- cgit v1.3-8-gc7d7 From 3a126f85e015701e56240884f27f97543580d5f7 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Fri, 27 Sep 2013 13:17:39 -0700 Subject: kernel/params: fix handling of signed integer types Commit 6072ddc8520b ("kernel: replace strict_strto*() with kstrto*()") broke the handling of signed integer types, fix it. Signed-off-by: Jean Delvare Reported-by: Christian Kujau Tested-by: Christian Kujau Cc: Jingoo Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 81c4e78c8f4c..c00d5b502aa4 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -254,11 +254,11 @@ int parse_args(const char *doing, STANDARD_PARAM_DEF(byte, unsigned char, "%hhu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtoul); +STANDARD_PARAM_DEF(short, short, "%hi", long, kstrtol); STANDARD_PARAM_DEF(ushort, unsigned short, "%hu", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(int, int, "%i", long, kstrtoul); +STANDARD_PARAM_DEF(int, int, "%i", long, kstrtol); STANDARD_PARAM_DEF(uint, unsigned int, "%u", unsigned long, kstrtoul); -STANDARD_PARAM_DEF(long, long, "%li", long, kstrtoul); +STANDARD_PARAM_DEF(long, long, "%li", long, kstrtol); STANDARD_PARAM_DEF(ulong, unsigned long, "%lu", unsigned long, kstrtoul); int param_set_charp(const char *val, const struct kernel_param *kp) -- cgit v1.3-8-gc7d7 From aab1728915420b5288cd0fc7b5bd320105b48983 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 30 Sep 2013 19:40:56 +0200 Subject: PM / hibernate: Fix user space driven resume regression Recent commit 8fd37a4 (PM / hibernate: Create memory bitmaps after freezing user space) broke the resume part of the user space driven hibernation (s2disk), because I forgot that the resume utility loaded the image into memory without freezing user space (it still freezes tasks after loading the image). This means that during user space driven resume we need to create the memory bitmaps at the "device open" time rather than at the "freeze tasks" time, so make that happen (that's a special case anyway, so it needs to be treated in a special way). Reported-and-tested-by: Ronald Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 5 ++++- kernel/power/user.c | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 358a146fd4da..98c3b34a4cff 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -743,7 +743,10 @@ int create_basic_memory_bitmaps(void) struct memory_bitmap *bm1, *bm2; int error = 0; - BUG_ON(forbidden_pages_map || free_pages_map); + if (forbidden_pages_map && free_pages_map) + return 0; + else + BUG_ON(forbidden_pages_map || free_pages_map); bm1 = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); if (!bm1) diff --git a/kernel/power/user.c b/kernel/power/user.c index 72e8f4fd616d..957f06164ad1 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -39,6 +39,7 @@ static struct snapshot_data { char frozen; char ready; char platform_support; + bool free_bitmaps; } snapshot_state; atomic_t snapshot_device_available = ATOMIC_INIT(1); @@ -82,6 +83,10 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->swap = -1; data->mode = O_WRONLY; error = pm_notifier_call_chain(PM_RESTORE_PREPARE); + if (!error) { + error = create_basic_memory_bitmaps(); + data->free_bitmaps = !error; + } if (error) pm_notifier_call_chain(PM_POST_RESTORE); } @@ -111,6 +116,8 @@ static int snapshot_release(struct inode *inode, struct file *filp) pm_restore_gfp_mask(); free_basic_memory_bitmaps(); thaw_processes(); + } else if (data->free_bitmaps) { + free_basic_memory_bitmaps(); } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); @@ -231,6 +238,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; pm_restore_gfp_mask(); free_basic_memory_bitmaps(); + data->free_bitmaps = false; thaw_processes(); data->frozen = 0; break; -- cgit v1.3-8-gc7d7 From 4c1c7be95c345cf2ad537a0c48e9aeadc7304527 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 30 Sep 2013 13:45:08 -0700 Subject: kernel/kmod.c: check for NULL in call_usermodehelper_exec() If /proc/sys/kernel/core_pattern contains only "|", a NULL pointer dereference happens upon core dump because argv_split("") returns argv[0] == NULL. This bug was once fixed by commit 264b83c07a84 ("usermodehelper: check subprocess_info->path != NULL") but was by error reintroduced by commit 7f57cfa4e2aa ("usermodehelper: kill the sub_info->path[0] check"). This bug seems to exist since 2.6.19 (the version which core dump to pipe was added). Depending on kernel version and config, some side effect might happen immediately after this oops (e.g. kernel panic with 2.6.32-358.18.1.el6). Signed-off-by: Tetsuo Handa Acked-by: Oleg Nesterov Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kmod.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kmod.c b/kernel/kmod.c index fb326365b694..b086006c59e7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -571,6 +571,10 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait) DECLARE_COMPLETION_ONSTACK(done); int retval = 0; + if (!sub_info->path) { + call_usermodehelper_freeinfo(sub_info); + return -EINVAL; + } helper_lock(); if (!khelper_wq || usermodehelper_disabled) { retval = -EBUSY; -- cgit v1.3-8-gc7d7 From 314a8ad0f18ac37887896b288939acd8cb17e208 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 30 Sep 2013 13:45:27 -0700 Subject: pidns: fix free_pid() to handle the first fork failure "case 0" in free_pid() assumes that disable_pid_allocation() should clear PIDNS_HASH_ADDING before the last pid goes away. However this doesn't happen if the first fork() fails to create the child reaper which should call disable_pid_allocation(). Signed-off-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index ebe5e80b10f8..9b9a26698144 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -273,6 +273,11 @@ void free_pid(struct pid *pid) */ wake_up_process(ns->child_reaper); break; + case PIDNS_HASH_ADDING: + /* Handle a fork failure of the first process */ + WARN_ON(ns->child_reaper); + ns->nr_hashed = 0; + /* fall through */ case 0: schedule_work(&ns->proc_work); break; -- cgit v1.3-8-gc7d7 From ded797547548a5b8e7b92383a41e4c0e6b0ecb7f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 24 Sep 2013 00:50:25 +0200 Subject: irq: Force hardirq exit's softirq processing on its own stack The commit facd8b80c67a3cf64a467c4a2ac5fb31f2e6745b ("irq: Sanitize invoke_softirq") converted irq exit calls of do_softirq() to __do_softirq() on all architectures, assuming it was only used there for its irq disablement properties. But as a side effect, the softirqs processed in the end of the hardirq are always called on the inline current stack that is used by irq_exit() instead of the softirq stack provided by the archs that override do_softirq(). The result is mostly safe if the architecture runs irq_exit() on a separate irq stack because then softirqs are processed on that same stack that is near empty at this stage (assuming hardirq aren't nesting). Otherwise irq_exit() runs in the task stack and so does the softirq too. The interrupted call stack can be randomly deep already and the softirq can dig through it even further. To add insult to the injury, this softirq can be interrupted by a new hardirq, maximizing the chances for a stack overrun as reported in powerpc for example: do_IRQ: stack overflow: 1920 CPU: 0 PID: 1602 Comm: qemu-system-ppc Not tainted 3.10.4-300.1.fc19.ppc64p7 #1 Call Trace: [c0000000050a8740] .show_stack+0x130/0x200 (unreliable) [c0000000050a8810] .dump_stack+0x28/0x3c [c0000000050a8880] .do_IRQ+0x2b8/0x2c0 [c0000000050a8930] hardware_interrupt_common+0x154/0x180 --- Exception: 501 at .cp_start_xmit+0x3a4/0x820 [8139cp] LR = .cp_start_xmit+0x390/0x820 [8139cp] [c0000000050a8d40] .dev_hard_start_xmit+0x394/0x640 [c0000000050a8e00] .sch_direct_xmit+0x110/0x260 [c0000000050a8ea0] .dev_queue_xmit+0x260/0x630 [c0000000050a8f40] .br_dev_queue_push_xmit+0xc4/0x130 [bridge] [c0000000050a8fc0] .br_dev_xmit+0x198/0x270 [bridge] [c0000000050a9070] .dev_hard_start_xmit+0x394/0x640 [c0000000050a9130] .dev_queue_xmit+0x428/0x630 [c0000000050a91d0] .ip_finish_output+0x2a4/0x550 [c0000000050a9290] .ip_local_out+0x50/0x70 [c0000000050a9310] .ip_queue_xmit+0x148/0x420 [c0000000050a93b0] .tcp_transmit_skb+0x4e4/0xaf0 [c0000000050a94a0] .__tcp_ack_snd_check+0x7c/0xf0 [c0000000050a9520] .tcp_rcv_established+0x1e8/0x930 [c0000000050a95f0] .tcp_v4_do_rcv+0x21c/0x570 [c0000000050a96c0] .tcp_v4_rcv+0x734/0x930 [c0000000050a97a0] .ip_local_deliver_finish+0x184/0x360 [c0000000050a9840] .ip_rcv_finish+0x148/0x400 [c0000000050a98d0] .__netif_receive_skb_core+0x4f8/0xb00 [c0000000050a99d0] .netif_receive_skb+0x44/0x110 [c0000000050a9a70] .br_handle_frame_finish+0x2bc/0x3f0 [bridge] [c0000000050a9b20] .br_nf_pre_routing_finish+0x2ac/0x420 [bridge] [c0000000050a9bd0] .br_nf_pre_routing+0x4dc/0x7d0 [bridge] [c0000000050a9c70] .nf_iterate+0x114/0x130 [c0000000050a9d30] .nf_hook_slow+0xb4/0x1e0 [c0000000050a9e00] .br_handle_frame+0x290/0x330 [bridge] [c0000000050a9ea0] .__netif_receive_skb_core+0x34c/0xb00 [c0000000050a9fa0] .netif_receive_skb+0x44/0x110 [c0000000050aa040] .napi_gro_receive+0xe8/0x120 [c0000000050aa0c0] .cp_rx_poll+0x31c/0x590 [8139cp] [c0000000050aa1d0] .net_rx_action+0x1dc/0x310 [c0000000050aa2b0] .__do_softirq+0x158/0x330 [c0000000050aa3b0] .irq_exit+0xc8/0x110 [c0000000050aa430] .do_IRQ+0xdc/0x2c0 [c0000000050aa4e0] hardware_interrupt_common+0x154/0x180 --- Exception: 501 at .bad_range+0x1c/0x110 LR = .get_page_from_freelist+0x908/0xbb0 [c0000000050aa7d0] .list_del+0x18/0x50 (unreliable) [c0000000050aa850] .get_page_from_freelist+0x908/0xbb0 [c0000000050aa9e0] .__alloc_pages_nodemask+0x21c/0xae0 [c0000000050aaba0] .alloc_pages_vma+0xd0/0x210 [c0000000050aac60] .handle_pte_fault+0x814/0xb70 [c0000000050aad50] .__get_user_pages+0x1a4/0x640 [c0000000050aae60] .get_user_pages_fast+0xec/0x160 [c0000000050aaf10] .__gfn_to_pfn_memslot+0x3b0/0x430 [kvm] [c0000000050aafd0] .kvmppc_gfn_to_pfn+0x64/0x130 [kvm] [c0000000050ab070] .kvmppc_mmu_map_page+0x94/0x530 [kvm] [c0000000050ab190] .kvmppc_handle_pagefault+0x174/0x610 [kvm] [c0000000050ab270] .kvmppc_handle_exit_pr+0x464/0x9b0 [kvm] [c0000000050ab320] kvm_start_lightweight+0x1ec/0x1fc [kvm] [c0000000050ab4f0] .kvmppc_vcpu_run_pr+0x168/0x3b0 [kvm] [c0000000050ab9c0] .kvmppc_vcpu_run+0xc8/0xf0 [kvm] [c0000000050aba50] .kvm_arch_vcpu_ioctl_run+0x5c/0x1a0 [kvm] [c0000000050abae0] .kvm_vcpu_ioctl+0x478/0x730 [kvm] [c0000000050abc90] .do_vfs_ioctl+0x4ec/0x7c0 [c0000000050abd80] .SyS_ioctl+0xd4/0xf0 [c0000000050abe30] syscall_exit+0x0/0x98 Since this is a regression, this patch proposes a minimalistic and low-risk solution by blindly forcing the hardirq exit processing of softirqs on the softirq stack. This way we should reduce significantly the opportunities for task stack overflow dug by softirqs. Longer term solutions may involve extending the hardirq stack coverage to irq_exit(), etc... Reported-by: Benjamin Herrenschmidt Acked-by: Linus Torvalds Signed-off-by: Frederic Weisbecker Cc: #3.9.. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul Mackerras Cc: James Hogan Cc: James E.J. Bottomley Cc: Helge Deller Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David S. Miller Cc: Andrew Morton --- kernel/softirq.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 53cc09ceb0b8..d7d498d8cc4f 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -328,10 +328,19 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (!force_irqthreads) - __do_softirq(); - else + if (!force_irqthreads) { + /* + * We can safely execute softirq on the current stack if + * it is the irq stack, because it should be near empty + * at this stage. But we have no way to know if the arch + * calls irq_exit() on the irq stack. So call softirq + * in its own stack to prevent from any overrun on top + * of a potentially deep task stack. + */ + do_softirq(); + } else { wakeup_softirqd(); + } } static inline void tick_irq_exit(void) -- cgit v1.3-8-gc7d7 From 9886167d20c0720dcfb01e62cdff4d906b226f43 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 3 Oct 2013 16:02:23 +0200 Subject: perf: Fix perf_pmu_migrate_context While auditing the list_entry usage due to a trinity bug I found that perf_pmu_migrate_context violates the rules for perf_event::event_entry. The problem is that perf_event::event_entry is a RCU list element, and hence we must wait for a full RCU grace period before re-using the element after deletion. Therefore the usage in perf_pmu_migrate_context() which re-uses the entry immediately is broken. For now introduce another list_head into perf_event for this specific usage. This doesn't actually fix the trinity report because that never goes through this code. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-mkj72lxagw1z8fvjm648iznw@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 +++++++++++++++++++++++- kernel/events/core.c | 6 +++--- 2 files changed, 26 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 866e85c5eb94..c8ba627c1d60 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -294,9 +294,31 @@ struct ring_buffer; */ struct perf_event { #ifdef CONFIG_PERF_EVENTS - struct list_head group_entry; + /* + * entry onto perf_event_context::event_list; + * modifications require ctx->lock + * RCU safe iterations. + */ struct list_head event_entry; + + /* + * XXX: group_entry and sibling_list should be mutually exclusive; + * either you're a sibling on a group, or you're the group leader. + * Rework the code to always use the same list element. + * + * Locked for modification by both ctx->mutex and ctx->lock; holding + * either sufficies for read. + */ + struct list_head group_entry; struct list_head sibling_list; + + /* + * We need storage to track the entries in perf_pmu_migrate_context; we + * cannot use the event_entry because of RCU and we want to keep the + * group in tact which avoids us using the other two entries. + */ + struct list_head migrate_entry; + struct hlist_node hlist_entry; int nr_siblings; int group_flags; diff --git a/kernel/events/core.c b/kernel/events/core.c index cb4238e85b38..d49a9d29334c 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7234,15 +7234,15 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) perf_remove_from_context(event); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); - list_add(&event->event_entry, &events); + list_add(&event->migrate_entry, &events); } mutex_unlock(&src_ctx->mutex); synchronize_rcu(); mutex_lock(&dst_ctx->mutex); - list_for_each_entry_safe(event, tmp, &events, event_entry) { - list_del(&event->event_entry); + list_for_each_entry_safe(event, tmp, &events, migrate_entry) { + list_del(&event->migrate_entry); if (event->state >= PERF_EVENT_STATE_OFF) event->state = PERF_EVENT_STATE_INACTIVE; account_event_cpu(event, dst_cpu); -- cgit v1.3-8-gc7d7 From ea84753c98a7ac6b74e530b64c444a912b3835ca Mon Sep 17 00:00:00 2001 From: Anjana V Kumar Date: Sat, 12 Oct 2013 10:59:17 +0800 Subject: cgroup: fix to break the while loop in cgroup_attach_task() correctly Both Anjana and Eunki reported a stall in the while_each_thread loop in cgroup_attach_task(). It's because, when we attach a single thread to a cgroup, if the cgroup is exiting or is already in that cgroup, we won't break the loop. If the task is already in the cgroup, the bug can lead to another thread being attached to the cgroup unexpectedly: # echo 5207 > tasks # cat tasks 5207 # echo 5207 > tasks # cat tasks 5207 5215 What's worse, if the task to be attached isn't the leader of the thread group, we might never exit the loop, hence cpu stall. Thanks for Oleg's analysis. This bug was introduced by commit 081aa458c38ba576bdd4265fc807fa95b48b9e79 ("cgroup: consolidate cgroup_attach_task() and cgroup_attach_proc()") [ lizf: - fixed the first continue, pointed out by Oleg, - rewrote changelog. ] Cc: # 3.9+ Reported-by: Eunki Kim Reported-by: Anjana V Kumar Signed-off-by: Anjana V Kumar Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8075b72d22be..1bf4f7a12703 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2038,7 +2038,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) - continue; + goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); @@ -2046,7 +2046,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) - continue; + goto next; /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. @@ -2054,7 +2054,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; - + next: if (!threadgroup) break; } while_each_thread(leader, tsk); -- cgit v1.3-8-gc7d7 From 3090ffb5a2515990182f3f55b0688a7817325488 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 17 Oct 2013 19:32:15 +0200 Subject: perf: Disable PERF_RECORD_MMAP2 support For now, we disable the extended MMAP record support (MMAP2). We have identified cases where it would not report the correct mapping information, clone(VM_CLONE) but with separate pids. We will revisit the support once we find a solution for this case. The patch changes the kernel to return EINVAL if attr->mmap2 is set. The patch also modifies the perf tool to use regular PERF_RECORD_MMAP for synthetic events and it also prevents the tool from requesting attr->mmap2 mode because the kernel would reject it. The support will be revisited once the kenrel interface is updated. In V2, we reduce the patch to the strict minimum. In V3, we avoid calling perf_event_open() with mmap2 set because we know it will fail and require fallback retry. Signed-off-by: Stephane Eranian Cc: Andi Kleen Cc: David Ahern Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20131017173215.GA8820@quad Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 4 ++++ tools/perf/util/event.c | 30 +++++++++++++----------------- tools/perf/util/evsel.c | 1 - 3 files changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d49a9d29334c..953c14348375 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6767,6 +6767,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + /* disabled for now */ + if (attr->mmap2) + return -EINVAL; + if (attr->__reserved_1) return -EINVAL; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 9b393e7dca6f..63df031fc9c7 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -187,7 +187,7 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, return -1; } - event->header.type = PERF_RECORD_MMAP2; + event->header.type = PERF_RECORD_MMAP; /* * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c */ @@ -198,7 +198,6 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, char prot[5]; char execname[PATH_MAX]; char anonstr[] = "//anon"; - unsigned int ino; size_t size; ssize_t n; @@ -209,13 +208,10 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, ""); /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ - n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n", - &event->mmap2.start, &event->mmap2.len, prot, - &event->mmap2.pgoff, &event->mmap2.maj, - &event->mmap2.min, - &ino, execname); - - event->mmap2.ino = (u64)ino; + n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %*x:%*x %*u %s\n", + &event->mmap.start, &event->mmap.len, prot, + &event->mmap.pgoff, + execname); if (n != 8) continue; @@ -227,15 +223,15 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, anonstr); size = strlen(execname) + 1; - memcpy(event->mmap2.filename, execname, size); + memcpy(event->mmap.filename, execname, size); size = PERF_ALIGN(size, sizeof(u64)); - event->mmap2.len -= event->mmap.start; - event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); - event->mmap2.header.size += machine->id_hdr_size; - event->mmap2.pid = tgid; - event->mmap2.tid = pid; + event->mmap.len -= event->mmap.start; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size)); + memset(event->mmap.filename + size, 0, machine->id_hdr_size); + event->mmap.header.size += machine->id_hdr_size; + event->mmap.pid = tgid; + event->mmap.tid = pid; if (process(tool, event, &synth_sample, machine) != 0) { rc = -1; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 0ce9febf1ba0..9f1ef9bee2d0 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -678,7 +678,6 @@ void perf_evsel__config(struct perf_evsel *evsel, attr->sample_type |= PERF_SAMPLE_WEIGHT; attr->mmap = track; - attr->mmap2 = track && !perf_missing_features.mmap2; attr->comm = track; /* -- cgit v1.3-8-gc7d7 From b0267507dfd0187fb7840a0ec461a510a7f041c5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 17 Oct 2013 19:45:29 +0900 Subject: mutex: Avoid gcc version dependent __builtin_constant_p() usage Commit 040a0a37 ("mutex: Add support for wound/wait style locks") used "!__builtin_constant_p(p == NULL)" but gcc 3.x cannot handle such expression correctly, leading to boot failure when built with CONFIG_DEBUG_MUTEXES=y. Fix it by explicitly passing a bool which tells whether p != NULL or not. [ PeterZ: This is a sad patch, but provided it actually generates similar code I suppose its the best we can do bar whole sale deprecating gcc-3. ] Signed-off-by: Tetsuo Handa Acked-by: Peter Zijlstra Acked-by: Maarten Lankhorst Cc: peterz@infradead.org Cc: imirkin@alum.mit.edu Cc: daniel.vetter@ffwll.ch Cc: robdclark@gmail.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/201310171945.AGB17114.FSQVtHOJFOOFML@I-love.SAKURA.ne.jp Signed-off-by: Ingo Molnar --- kernel/mutex.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 6d647aedffea..d24105b1b794 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, static __always_inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct task_struct *task = current; struct mutex_waiter waiter; @@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *owner; struct mspin_node node; - if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); - if (!__builtin_constant_p(ww_ctx == NULL)) { + if (use_ww_ctx) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -551,7 +551,7 @@ slowpath: goto err; } - if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx->acquired > 0) { ret = __mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; @@ -575,7 +575,7 @@ skip_wait: lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); - if (!__builtin_constant_p(ww_ctx == NULL)) { + if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct mutex_waiter *cur; @@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { might_sleep(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL); + 0, nest, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + 0, &ctx->dep_map, _RET_IP_, ctx, 1); if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + 0, &ctx->dep_map, _RET_IP_, ctx, 1); if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count) struct mutex *lock = container_of(lock_count, struct mutex, count); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx); + NULL, _RET_IP_, ctx, 1); } static noinline int __sched @@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx); + NULL, _RET_IP_, ctx, 1); } #endif -- cgit v1.3-8-gc7d7 From 97b9410643475d6557d2517c2aff9fd2221141a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Sep 2013 21:50:23 +0200 Subject: clockevents: Sanitize ticks to nsec conversion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Marc Kleine-Budde pointed out, that commit 77cc982 "clocksource: use clockevents_config_and_register() where possible" caused a regression for some of the converted subarchs. The reason is, that the clockevents core code converts the minimal hardware tick delta to a nanosecond value for core internal usage. This conversion is affected by integer math rounding loss, so the backwards conversion to hardware ticks will likely result in a value which is less than the configured hardware limitation. The affected subarchs used their own workaround (SIGH!) which got lost in the conversion. The solution for the issue at hand is simple: adding evt->mult - 1 to the shifted value before the integer divison in the core conversion function takes care of it. But this only works for the case where for the scaled math mult/shift pair "mult <= 1 << shift" is true. For the case where "mult > 1 << shift" we can apply the rounding add only for the minimum delta value to make sure that the backward conversion is not less than the given hardware limit. For the upper bound we need to omit the rounding add, because the backwards conversion is always larger than the original latch value. That would violate the upper bound of the hardware device. Though looking closer at the details of that function reveals another bogosity: The upper bounds check is broken as well. Checking for a resulting "clc" value greater than KTIME_MAX after the conversion is pointless. The conversion does: u64 clc = (latch << evt->shift) / evt->mult; So there is no sanity check for (latch << evt->shift) exceeding the 64bit boundary. The latch argument is "unsigned long", so on a 64bit arch the handed in argument could easily lead to an unnoticed shift overflow. With the above rounding fix applied the calculation before the divison is: u64 clc = (latch << evt->shift) + evt->mult - 1; So we need to make sure, that neither the shift nor the rounding add is overflowing the u64 boundary. [ukl: move assignment to rnd after eventually changing mult, fix build issue and correct comment with the right math] Signed-off-by: Thomas Gleixner Cc: Russell King - ARM Linux Cc: Marc Kleine-Budde Cc: nicolas.ferre@atmel.com Cc: Marc Pignat Cc: john.stultz@linaro.org Cc: kernel@pengutronix.de Cc: Ronald Wahl Cc: LAK Cc: Ludovic Desroches Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1380052223-24139-1-git-send-email-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- kernel/time/clockevents.c | 65 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 38959c866789..662c5798a685 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -33,29 +33,64 @@ struct ce_unbind { int res; }; -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch: value to convert - * @evt: pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) { u64 clc = (u64) latch << evt->shift; + u64 rnd; if (unlikely(!evt->mult)) { evt->mult = 1; WARN_ON(1); } + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1U << evt->shift))) + clc += rnd; do_div(clc, evt->mult); - if (clc < 1000) - clc = 1000; - if (clc > KTIME_MAX) - clc = KTIME_MAX; - return clc; + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); - dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); - dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** -- cgit v1.3-8-gc7d7 From d3c345dbc7c083414ef74eb22ff26ba2bd100759 Mon Sep 17 00:00:00 2001 From: Russ Dill Date: Thu, 24 Oct 2013 14:25:26 +0100 Subject: PM / hibernate: Move software_resume to late_initcall_sync software_resume is being called after deferred_probe_initcall in drivers base. If the probing of the device that contains the resume image is deferred, and the system has been instructed to wait for it to show up, this wait will occur in software_resume. This causes a deadlock. Move software_resume into late_initcall_sync so that it happens after all the other late_initcalls. Signed-off-by: Russ Dill Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c9c759d5a15c..0121dab83f43 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -846,7 +846,7 @@ static int software_resume(void) goto Finish; } -late_initcall(software_resume); +late_initcall_sync(software_resume); static const char * const hibernation_modes[] = { -- cgit v1.3-8-gc7d7 From bf378d341e4873ed928dc3c636252e6895a21f50 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 28 Oct 2013 13:55:29 +0100 Subject: perf: Fix perf ring buffer memory ordering The PPC64 people noticed a missing memory barrier and crufty old comments in the perf ring buffer code. So update all the comments and add the missing barrier. When the architecture implements local_t using atomic_long_t there will be double barriers issued; but short of introducing more conditional barrier primitives this is the best we can do. Reported-by: Victor Kaplansky Tested-by: Victor Kaplansky Signed-off-by: Peter Zijlstra Cc: Mathieu Desnoyers Cc: michael@ellerman.id.au Cc: Paul McKenney Cc: Michael Neuling Cc: Frederic Weisbecker Cc: anton@samba.org Cc: benh@kernel.crashing.org Link: http://lkml.kernel.org/r/20131025173749.GG19466@laptop.lan Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 12 +++++++----- kernel/events/ring_buffer.c | 31 +++++++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 009a655a5d35..2fc1602e23bb 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -456,13 +456,15 @@ struct perf_event_mmap_page { /* * Control data for the mmap() data buffer. * - * User-space reading the @data_head value should issue an rmb(), on - * SMP capable platforms, after reading this value -- see - * perf_event_wakeup(). + * User-space reading the @data_head value should issue an smp_rmb(), + * after reading this value. * * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data. In this case - * the kernel will not over-write unread data. + * written by userspace to reflect the last read data, after issueing + * an smp_mb() to separate the data read from the ->data_tail store. + * In this case the kernel will not over-write unread data. + * + * See perf_output_put_handle() for the data ordering. */ __u64 data_head; /* head in the data section */ __u64 data_tail; /* user-space written tail */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index cd55144270b5..9c2ddfbf4525 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -87,10 +87,31 @@ again: goto out; /* - * Publish the known good head. Rely on the full barrier implied - * by atomic_dec_and_test() order the rb->head read and this - * write. + * Since the mmap() consumer (userspace) can run on a different CPU: + * + * kernel user + * + * READ ->data_tail READ ->data_head + * smp_mb() (A) smp_rmb() (C) + * WRITE $data READ $data + * smp_wmb() (B) smp_mb() (D) + * STORE ->data_head WRITE ->data_tail + * + * Where A pairs with D, and B pairs with C. + * + * I don't think A needs to be a full barrier because we won't in fact + * write data until we see the store from userspace. So we simply don't + * issue the data WRITE until we observe it. Be conservative for now. + * + * OTOH, D needs to be a full barrier since it separates the data READ + * from the tail WRITE. + * + * For B a WMB is sufficient since it separates two WRITEs, and for C + * an RMB is sufficient since it separates two READs. + * + * See perf_output_begin(). */ + smp_wmb(); rb->user_page->data_head = head; /* @@ -154,9 +175,11 @@ int perf_output_begin(struct perf_output_handle *handle, * Userspace could choose to issue a mb() before updating the * tail pointer. So that all reads will be completed before the * write is issued. + * + * See perf_output_put_handle(). */ tail = ACCESS_ONCE(rb->user_page->data_tail); - smp_rmb(); + smp_mb(); offset = head = local_read(&rb->head); head += size; if (unlikely(!perf_output_space(rb, tail, offset, head))) -- cgit v1.3-8-gc7d7