From 0f67f04ffcb592d065a20862a82d4539e0f8e909 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Feb 2015 11:56:20 -0500 Subject: tracing: Only create tracer options files if directory exists Do not bother creating tracer options if no tracing directory exists. If a tracer is enabled via the command line, and is started before the tracing directory is created, then it wont have its tracer specific options created. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 38c613ede10d..d4627f15407a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4172,8 +4172,11 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - /* Currently, only the top instance has options */ - if (tr->flags & TRACE_ARRAY_FL_GLOBAL) { + /* + * Only enable if the directory has been created already. + * Currently, only the top instance has options + */ + if (tr->dir && tr->flags & TRACE_ARRAY_FL_GLOBAL) { destroy_trace_option_files(topts); topts = create_trace_option_files(tr, t); } -- cgit v1.3-8-gc7d7 From 09d23a1d8a82e814bd56a4f121b80ea8214ac49d Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 3 Feb 2015 12:45:53 -0500 Subject: tracing: Create cmdline tracer options on tracing fs init The options for cmdline tracers are not created if the debugfs system is not ready yet. If tracing has started before debugfs is up, then the option files for the tracer are not created. Create them when creating the tracing directory if the current tracer requires option files. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d4627f15407a..05e0e50539fc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4105,9 +4105,24 @@ static void tracing_set_nop(struct trace_array *tr) tr->current_trace = &nop_trace; } -static int tracing_set_tracer(struct trace_array *tr, const char *buf) +static void update_tracer_options(struct trace_array *tr, struct tracer *t) { static struct trace_option_dentry *topts; + + /* Only enable if the directory has been created already. */ + if (!tr->dir) + return; + + /* Currently, only the top instance has options */ + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL)) + return; + + destroy_trace_option_files(topts); + topts = create_trace_option_files(tr, t); +} + +static int tracing_set_tracer(struct trace_array *tr, const char *buf) +{ struct tracer *t; #ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; @@ -4172,14 +4187,7 @@ static int tracing_set_tracer(struct trace_array *tr, const char *buf) free_snapshot(tr); } #endif - /* - * Only enable if the directory has been created already. 
- * Currently, only the top instance has options - */ - if (tr->dir && tr->flags & TRACE_ARRAY_FL_GLOBAL) { - destroy_trace_option_files(topts); - topts = create_trace_option_files(tr, t); - } + update_tracer_options(tr, t); #ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { @@ -6578,6 +6586,10 @@ static __init int tracer_init_debugfs(void) create_trace_options_dir(&global_trace); + /* If the tracer was started via cmdline, create options for it here */ + if (global_trace.current_trace != &nop_trace) + update_tracer_options(&global_trace, global_trace.current_trace); + return 0; } -- cgit v1.3-8-gc7d7 From 8434dc9340cd2e117fc944cf7526263bf490a52a Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 12:13:40 -0500 Subject: tracing: Convert the tracing facility over to use tracefs debugfs was fine for the tracing facility as a quick way to get an interface. Now that tracing has matured, it should separate itself from debugfs such that it can be mounted separately without needing to mount all of debugfs with it. That is, users resist using tracing because it requires mounting debugfs. Having tracing have its own file system lets users get the features of tracing without needing to bring in the rest of the kernel's debug infrastructure. Another reason for tracefs is that debubfs does not support mkdir. Currently, to create instances, one does a mkdir in the tracing/instance directory. This is implemented via a hack that forces debugfs to do something it is not intended on doing. By converting over to tracefs, this hack can be removed and mkdir can be properly implemented. This patch does not address this yet, but it lays the ground work for that to be done. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 22 +++++++-------- kernel/trace/trace.c | 55 +++++++++++++++++++++--------------- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 32 ++++++++++----------- kernel/trace/trace_functions_graph.c | 7 ++--- kernel/trace/trace_kprobe.c | 10 +++---- kernel/trace/trace_probe.h | 2 +- kernel/trace/trace_stat.c | 10 +++---- 8 files changed, 74 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45e5cb143d17..fcc0e7052a79 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -1008,7 +1008,7 @@ static struct tracer_stat function_stats __initdata = { .stat_show = function_stat_show }; -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { struct ftrace_profile_stat *stat; struct dentry *entry; @@ -1044,15 +1044,15 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) } } - entry = debugfs_create_file("function_profile_enabled", 0644, + entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ -static __init void ftrace_profile_debugfs(struct dentry *d_tracer) +static __init void ftrace_profile_tracefs(struct dentry *d_tracer) { } #endif /* CONFIG_FUNCTION_PROFILER */ @@ -4690,7 +4690,7 @@ void ftrace_destroy_filter_files(struct ftrace_ops *ops) mutex_unlock(&ftrace_lock); } -static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) +static __init int 
ftrace_init_dyn_tracefs(struct dentry *d_tracer) { trace_create_file("available_filter_functions", 0444, @@ -4998,7 +4998,7 @@ static int __init ftrace_nodyn_init(void) } core_initcall(ftrace_nodyn_init); -static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; } +static inline int ftrace_init_dyn_tracefs(struct dentry *d_tracer) { return 0; } static inline void ftrace_startup_enable(int command) { } static inline void ftrace_startup_all(int command) { } /* Keep as macros so we do not need to define the commands */ @@ -5451,7 +5451,7 @@ static const struct file_operations ftrace_pid_fops = { .release = ftrace_pid_release, }; -static __init int ftrace_init_debugfs(void) +static __init int ftrace_init_tracefs(void) { struct dentry *d_tracer; @@ -5459,16 +5459,16 @@ static __init int ftrace_init_debugfs(void) if (IS_ERR(d_tracer)) return 0; - ftrace_init_dyn_debugfs(d_tracer); + ftrace_init_dyn_tracefs(d_tracer); trace_create_file("set_ftrace_pid", 0644, d_tracer, NULL, &ftrace_pid_fops); - ftrace_profile_debugfs(d_tracer); + ftrace_profile_tracefs(d_tracer); return 0; } -fs_initcall(ftrace_init_debugfs); +fs_initcall(ftrace_init_tracefs); /** * ftrace_kill - kill ftrace diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 05e0e50539fc..6c4739bee4bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -5828,6 +5829,14 @@ static inline __init int register_snapshot_cmd(void) { return 0; } static struct dentry *tracing_get_dentry(struct trace_array *tr) { + if (WARN_ON(!tr->dir)) + return ERR_PTR(-ENODEV); + + /* Top directory uses NULL as the parent */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return NULL; + + /* All sub buffers have a descriptor */ return tr->dir; } @@ -5842,10 +5851,10 @@ static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) if (IS_ERR(d_tracer)) return NULL; - tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = tracefs_create_dir("per_cpu", d_tracer); WARN_ONCE(!tr->percpu_dir, - "Could not create debugfs directory 'per_cpu/%d'\n", cpu); + "Could not create tracefs directory 'per_cpu/%d'\n", cpu); return tr->percpu_dir; } @@ -5862,7 +5871,7 @@ trace_create_cpu_file(const char *name, umode_t mode, struct dentry *parent, } static void -tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) +tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; @@ -5872,9 +5881,9 @@ tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) return; snprintf(cpu_dir, 30, "cpu%ld", cpu); - d_cpu = debugfs_create_dir(cpu_dir, d_percpu); + d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create debugfs '%s' entry\n", cpu_dir); + pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6026,9 +6035,9 @@ struct dentry *trace_create_file(const char *name, { struct dentry *ret; - ret = debugfs_create_file(name, mode, parent, data, fops); + ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create debugfs '%s' entry\n", name); + pr_warning("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6045,9 +6054,9 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) if (IS_ERR(d_tracer)) return NULL; - tr->options = debugfs_create_dir("options", d_tracer); + tr->options = tracefs_create_dir("options", d_tracer); 
if (!tr->options) { - pr_warning("Could not create debugfs directory 'options'\n"); + pr_warning("Could not create tracefs directory 'options'\n"); return NULL; } @@ -6116,7 +6125,7 @@ destroy_trace_option_files(struct trace_option_dentry *topts) return; for (cnt = 0; topts[cnt].opt; cnt++) - debugfs_remove(topts[cnt].entry); + tracefs_remove(topts[cnt].entry); kfree(topts); } @@ -6205,7 +6214,7 @@ static const struct file_operations rb_simple_fops = { struct dentry *trace_instance_dir; static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer); static int allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) @@ -6321,17 +6330,17 @@ static int new_instance_create(const char *name) if (allocate_trace_buffers(tr, trace_buf_size) < 0) goto out_free_tr; - tr->dir = debugfs_create_dir(name, trace_instance_dir); + tr->dir = tracefs_create_dir(name, trace_instance_dir); if (!tr->dir) goto out_free_tr; ret = event_trace_add_tracer(tr->dir, tr); if (ret) { - debugfs_remove_recursive(tr->dir); + tracefs_remove_recursive(tr->dir); goto out_free_tr; } - init_tracer_debugfs(tr, tr->dir); + init_tracer_tracefs(tr, tr->dir); list_add(&tr->list, &ftrace_trace_arrays); @@ -6404,7 +6413,7 @@ static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t m return -ENOENT; /* - * The inode mutex is locked, but debugfs_create_dir() will also + * The inode mutex is locked, but tracefs_create_dir() will also * take the mutex. As the instances directory can not be destroyed * or changed in any other way, it is safe to unlock it, and * let the dentry try. If two users try to make the same dir at @@ -6434,7 +6443,7 @@ static int instance_rmdir(struct inode *inode, struct dentry *dentry) mutex_unlock(&dentry->d_inode->i_mutex); /* - * The inode mutex is locked, but debugfs_create_dir() will also + * The inode mutex is locked, but tracefs_create_dir() will also * take the mutex. As the instances directory can not be destroyed * or changed in any other way, it is safe to unlock it, and * let the dentry try. 
If two users try to make the same dir at @@ -6459,7 +6468,7 @@ static const struct inode_operations instance_dir_inode_operations = { static __init void create_trace_instances(struct dentry *d_tracer) { - trace_instance_dir = debugfs_create_dir("instances", d_tracer); + trace_instance_dir = tracefs_create_dir("instances", d_tracer); if (WARN_ON(!trace_instance_dir)) return; @@ -6468,7 +6477,7 @@ static __init void create_trace_instances(struct dentry *d_tracer) } static void -init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) { int cpu; @@ -6522,7 +6531,7 @@ init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) #endif for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(tr, cpu); + tracing_init_tracefs_percpu(tr, cpu); } @@ -6550,10 +6559,10 @@ struct dentry *tracing_init_dentry(void) return ERR_PTR(-ENOMEM); } - return tr->dir; + return NULL; } -static __init int tracer_init_debugfs(void) +static __init int tracer_init_tracefs(void) { struct dentry *d_tracer; @@ -6563,7 +6572,7 @@ static __init int tracer_init_debugfs(void) if (IS_ERR(d_tracer)) return 0; - init_tracer_debugfs(&global_trace, d_tracer); + init_tracer_tracefs(&global_trace, d_tracer); trace_create_file("tracing_thresh", 0644, d_tracer, &global_trace, &tracing_thresh_fops); @@ -6925,5 +6934,5 @@ __init static int clear_boot_tracer(void) return 0; } -fs_initcall(tracer_init_debugfs); +fs_initcall(tracer_init_tracefs); late_initcall(clear_boot_tracer); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd8205a35760..d951deddec89 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -334,7 +334,7 @@ struct tracer_flags { /** - * struct tracer - a specific tracer and its callbacks to interact with debugfs + * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index db54dda10ccc..0d2e47370ee7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -480,7 +480,7 @@ static void remove_subsystem(struct ftrace_subsystem_dir *dir) return; if (!--dir->nr_events) { - debugfs_remove_recursive(dir->entry); + tracefs_remove_recursive(dir->entry); list_del(&dir->list); __put_system_dir(dir); } @@ -499,7 +499,7 @@ static void remove_event_file_dir(struct ftrace_event_file *file) } spin_unlock(&dir->d_lock); - debugfs_remove_recursive(dir); + tracefs_remove_recursive(dir); } list_del(&file->list); @@ -1526,7 +1526,7 @@ event_subsystem_dir(struct trace_array *tr, const char *name, } else __get_system(system); - dir->entry = debugfs_create_dir(name, parent); + dir->entry = tracefs_create_dir(name, parent); if (!dir->entry) { pr_warn("Failed to create system directory %s\n", name); __put_system(system); @@ -1539,12 +1539,12 @@ event_subsystem_dir(struct trace_array *tr, const char *name, dir->subsystem = system; file->system = dir; - entry = debugfs_create_file("filter", 0644, dir->entry, dir, + entry = tracefs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warn("Could not create debugfs '%s/filter' entry\n", name); + 
pr_warn("Could not create tracefs '%s/filter' entry\n", name); } trace_create_file("enable", 0644, dir->entry, dir, @@ -1585,9 +1585,9 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file) d_events = parent; name = ftrace_event_name(call); - file->dir = debugfs_create_dir(name, d_events); + file->dir = tracefs_create_dir(name, d_events); if (!file->dir) { - pr_warn("Could not create debugfs '%s' directory\n", name); + pr_warn("Could not create tracefs '%s' directory\n", name); return -1; } @@ -2228,7 +2228,7 @@ static inline int register_event_cmds(void) { return 0; } /* * The top level array has already had its ftrace_event_file * descriptors created in order to allow for early events to - * be recorded. This function is called after the debugfs has been + * be recorded. This function is called after the tracefs has been * initialized, and we now have to create the files associated * to the events. */ @@ -2311,16 +2311,16 @@ create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) struct dentry *d_events; struct dentry *entry; - entry = debugfs_create_file("set_event", 0644, parent, + entry = tracefs_create_file("set_event", 0644, parent, tr, &ftrace_set_event_fops); if (!entry) { - pr_warn("Could not create debugfs 'set_event' entry\n"); + pr_warn("Could not create tracefs 'set_event' entry\n"); return -ENOMEM; } - d_events = debugfs_create_dir("events", parent); + d_events = tracefs_create_dir("events", parent); if (!d_events) { - pr_warn("Could not create debugfs 'events' directory\n"); + pr_warn("Could not create tracefs 'events' directory\n"); return -ENOMEM; } @@ -2412,7 +2412,7 @@ int event_trace_del_tracer(struct trace_array *tr) down_write(&trace_event_sem); __trace_remove_event_dirs(tr); - debugfs_remove_recursive(tr->event_dir); + tracefs_remove_recursive(tr->event_dir); up_write(&trace_event_sem); tr->event_dir = NULL; @@ -2534,10 +2534,10 @@ static __init int event_trace_init(void) if (IS_ERR(d_tracer)) return 0; - entry = debugfs_create_file("available_events", 0444, d_tracer, + entry = tracefs_create_file("available_events", 0444, d_tracer, tr, &ftrace_avail_fops); if (!entry) - pr_warn("Could not create debugfs 'available_events' entry\n"); + pr_warn("Could not create tracefs 'available_events' entry\n"); if (trace_define_common_fields()) pr_warn("tracing: Failed to allocate common fields"); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 2d25ad1526bb..9cfea4c6d314 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -6,7 +6,6 @@ * is Copyright (c) Steven Rostedt * */ -#include #include #include #include @@ -151,7 +150,7 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, * The curr_ret_stack is initialized to -1 and get increased * in this function. So it can be less than -1 only if it was * filtered out via ftrace_graph_notrace_addr() which can be - * set from set_graph_notrace file in debugfs by user. + * set from set_graph_notrace file in tracefs by user. 
*/ if (current->curr_ret_stack < -1) return -EBUSY; @@ -1432,7 +1431,7 @@ static const struct file_operations graph_depth_fops = { .llseek = generic_file_llseek, }; -static __init int init_graph_debugfs(void) +static __init int init_graph_tracefs(void) { struct dentry *d_tracer; @@ -1445,7 +1444,7 @@ static __init int init_graph_debugfs(void) return 0; } -fs_initcall(init_graph_debugfs); +fs_initcall(init_graph_tracefs); static __init int init_graph_trace(void) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index b4a00def88f5..c1c6655847c8 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1310,7 +1310,7 @@ static int unregister_kprobe_event(struct trace_kprobe *tk) return ret; } -/* Make a debugfs interface for controlling probe points */ +/* Make a tracefs interface for controlling probe points */ static __init int init_kprobe_trace(void) { struct dentry *d_tracer; @@ -1323,20 +1323,20 @@ static __init int init_kprobe_trace(void) if (IS_ERR(d_tracer)) return 0; - entry = debugfs_create_file("kprobe_events", 0644, d_tracer, + entry = tracefs_create_file("kprobe_events", 0644, d_tracer, NULL, &kprobe_events_ops); /* Event list interface */ if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'kprobe_events' entry\n"); /* Profile interface */ - entry = debugfs_create_file("kprobe_profile", 0444, d_tracer, + entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'kprobe_profile' entry\n"); return 0; } diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..19aff635841a 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 75e19e86c954..6cf935316769 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "trace_stat.h" #include "trace.h" @@ -65,7 +65,7 @@ static void reset_stat_session(struct stat_session *session) static void destroy_session(struct stat_session *session) { - debugfs_remove(session->file); + tracefs_remove(session->file); __reset_stat_session(session); mutex_destroy(&session->stat_mutex); kfree(session); @@ -279,9 +279,9 @@ static int tracing_stat_init(void) if (IS_ERR(d_tracing)) return 0; - stat_dir = debugfs_create_dir("trace_stat", d_tracing); + stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create debugfs " + pr_warning("Could not create tracefs " "'trace_stat' entry\n"); return 0; } @@ -291,7 +291,7 @@ static int init_stat_file(struct stat_session *session) if (!stat_dir && tracing_stat_init()) return -ENODEV; - session->file = debugfs_create_file(session->ts->name, 0644, + session->file = tracefs_create_file(session->ts->name, 0644, stat_dir, session, &tracing_stat_fops); if (!session->file) -- cgit v1.3-8-gc7d7 From f76180bc07abc399977bfbe8c43bf58c4570e893 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 20 Jan 2015 15:48:46 -0500 Subject: tracing: Automatically mount tracefs on debugfs/tracing As tools currently rely on the tracing directory in debugfs, we can not just created a tracefs infrastructure and expect sysadmins to mount the new tracefs to have their old tools work. 
Instead, the debugfs tracing directory is still created and the tracefs file system is mounted there when the debugfs filesystem is mounted. No longer does the tracing infrastructure update the debugfs file system, but instead interacts with the tracefs file system. But now, it still appears to the user like nothing changed, except you also have the feature of mounting just the tracing system without needing all of debugfs! Cc: Al Viro Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c4739bee4bb..b4aa936509d2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -6535,6 +6536,28 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer) } +static struct vfsmount *trace_automount(void *ingore) +{ + struct vfsmount *mnt; + struct file_system_type *type; + + /* + * To maintain backward compatibility for tools that mount + * debugfs to get to the tracing facility, tracefs is automatically + * mounted to the debugfs/tracing directory. + */ + type = get_fs_type("tracefs"); + if (!type) + return NULL; + mnt = vfs_kern_mount(type, 0, "tracefs", NULL); + put_filesystem(type); + if (IS_ERR(mnt)) + return NULL; + mntget(mnt); + + return mnt; +} + /** * tracing_init_dentry - initialize top level trace array * @@ -6546,14 +6569,21 @@ struct dentry *tracing_init_dentry(void) { struct trace_array *tr = &global_trace; + /* The top level trace array uses NULL as parent */ if (tr->dir) - return tr->dir; + return NULL; if (WARN_ON(!debugfs_initialized())) return ERR_PTR(-ENODEV); - tr->dir = debugfs_create_dir("tracing", NULL); - + /* + * As there may still be users that expect the tracing + * files to exist in debugfs/tracing, we must automount + * the tracefs file system there, so older tools still + * work with the newer kerenl. + */ + tr->dir = debugfs_create_automount("tracing", NULL, + trace_automount, NULL); if (!tr->dir) { pr_warn_once("Could not create debugfs directory 'tracing'\n"); return ERR_PTR(-ENOMEM); -- cgit v1.3-8-gc7d7 From eae473581cf93dad94ca833aa961c033c6a43924 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 21 Jan 2015 10:01:39 -0500 Subject: tracing: Have mkdir and rmdir be part of tracefs The tracing "instances" directory can create sub tracing buffers with mkdir, and remove them with rmdir. As a mkdir will also create all the files and directories that control the sub buffer the inode mutexes need to be released before this is done, to avoid deadlocks. It is better to let the tracing system unlock the inode mutexes before calling the functions that create the files within the new directory (or deletes the files from the one being destroyed). Now that tracing has been converted over to tracefs, the tracefs file system can be modified to accommodate this feature. It still releases the locks, but the filesystem itself can take care of the ugly business and let the user just do what it needs. The tracing system now attaches a descriptor to the directory dentry that can have userspace create or remove sub directories. If this descriptor does not exist for a dentry, then that dentry can not be used to create other directories. This descriptor holds a mkdir and rmdir method that only takes a character string as an argument. 
The tracefs file system will first make a copy of the dentry name before releasing the locks. Then it will pass the copied name to the methods. It is up to the tracing system that supplied the methods to handle races with duplicate names and such as all the inode mutexes would be released when the functions are called. Cc: Al Viro Signed-off-by: Steven Rostedt --- fs/tracefs/inode.c | 151 +++++++++++++++++++++++++++++++++++++++++++----- include/linux/tracefs.h | 4 ++ kernel/trace/trace.c | 75 ++---------------------- 3 files changed, 145 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0b9cf5cf24c9..d92bdf3b079a 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -50,6 +50,84 @@ static const struct file_operations tracefs_file_operations = { .llseek = noop_llseek, }; +static struct tracefs_dir_ops { + int (*mkdir)(const char *name); + int (*rmdir)(const char *name); +} tracefs_ops; + +static char *get_dname(struct dentry *dentry) +{ + const char *dname; + char *name; + int len = dentry->d_name.len; + + dname = dentry->d_name.name; + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) + return NULL; + memcpy(name, dname, len); + name[len] = 0; + return name; +} + +static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The mkdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * mkdir routine to handle races. + */ + mutex_unlock(&inode->i_mutex); + ret = tracefs_ops.mkdir(name); + mutex_lock(&inode->i_mutex); + + kfree(name); + + return ret; +} + +static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) +{ + char *name; + int ret; + + name = get_dname(dentry); + if (!name) + return -ENOMEM; + + /* + * The rmdir call can call the generic functions that create + * the files within the tracefs system. It is up to the individual + * rmdir routine to handle races. + * This time we need to unlock not only the parent (inode) but + * also the directory that is being deleted. + */ + mutex_unlock(&inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); + + ret = tracefs_ops.rmdir(name); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + kfree(name); + + return ret; +} + +static const struct inode_operations tracefs_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = tracefs_syscall_mkdir, + .rmdir = tracefs_syscall_rmdir, +}; + static struct inode *tracefs_get_inode(struct super_block *sb) { struct inode *inode = new_inode(sb); @@ -334,6 +412,31 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, return end_creating(dentry); } +static struct dentry *__create_dir(const char *name, struct dentry *parent, + const struct inode_operations *ops) +{ + struct dentry *dentry = start_creating(name, parent); + struct inode *inode; + + if (IS_ERR(dentry)) + return NULL; + + inode = tracefs_get_inode(dentry->d_sb); + if (unlikely(!inode)) + return failed_creating(dentry); + + inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; + inode->i_op = ops; + inode->i_fop = &simple_dir_operations; + + /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ + inc_nlink(inode); + d_instantiate(dentry, inode); + inc_nlink(dentry->d_parent->d_inode); + fsnotify_mkdir(dentry->d_parent->d_inode, dentry); + return end_creating(dentry); +} + /** * tracefs_create_dir - create a directory in the tracefs filesystem * @name: a pointer to a string containing the name of the directory to @@ -353,26 +456,44 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, */ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent) { - struct dentry *dentry = start_creating(name, parent); - struct inode *inode; + return __create_dir(name, parent, &simple_dir_inode_operations); +} - if (IS_ERR(dentry)) +/** + * tracefs_create_instance_dir - create the tracing instances directory + * @name: The name of the instances directory to create + * @parent: The parent directory that the instances directory will exist + * @mkdir: The function to call when a mkdir is performed. + * @rmdir: The function to call when a rmdir is performed. + * + * Only one instances directory is allowed. + * + * The instances directory is special as it allows for mkdir and rmdir to + * to be done by userspace. When a mkdir or rmdir is performed, the inode + * locks are released and the methhods passed in (@mkdir and @rmdir) are + * called without locks and with the name of the directory being created + * within the instances directory. + * + * Returns the dentry of the instances directory. + */ +struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, + int (*mkdir)(const char *name), + int (*rmdir)(const char *name)) +{ + struct dentry *dentry; + + /* Only allow one instance of the instances directory. */ + if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir)) return NULL; - inode = tracefs_get_inode(dentry->d_sb); - if (unlikely(!inode)) - return failed_creating(dentry); + dentry = __create_dir(name, parent, &tracefs_dir_inode_operations); + if (!dentry) + return NULL; - inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO; - inode->i_op = &simple_dir_inode_operations; - inode->i_fop = &simple_dir_operations; + tracefs_ops.mkdir = mkdir; + tracefs_ops.rmdir = rmdir; - /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ - inc_nlink(inode); - d_instantiate(dentry, inode); - inc_nlink(dentry->d_parent->d_inode); - fsnotify_mkdir(dentry->d_parent->d_inode, dentry); - return end_creating(dentry); + return dentry; } static inline int tracefs_positive(struct dentry *dentry) diff --git a/include/linux/tracefs.h b/include/linux/tracefs.h index 23e04ce21749..5b727a17beee 100644 --- a/include/linux/tracefs.h +++ b/include/linux/tracefs.h @@ -34,6 +34,10 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent); void tracefs_remove(struct dentry *dentry); void tracefs_remove_recursive(struct dentry *dentry); +struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *parent, + int (*mkdir)(const char *name), + int (*rmdir)(const char *name)); + bool tracefs_initialized(void); #endif /* CONFIG_TRACING */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b4aa936509d2..3c8913bac204 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6292,7 +6292,7 @@ static void free_trace_buffers(struct trace_array *tr) #endif } -static int new_instance_create(const char *name) +static int instance_mkdir(const char *name) { struct trace_array *tr; int ret; @@ -6362,7 +6362,7 @@ static int new_instance_create(const char *name) } -static int instance_delete(const char *name) +static int instance_rmdir(const char *name) { struct trace_array *tr; int found = 0; @@ -6403,78 +6403,13 @@ static int instance_delete(const char *name) return ret; } -static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) -{ - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* - * The inode mutex is locked, but tracefs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the new_instance_create() will determine the - * winner. - */ - mutex_unlock(&inode->i_mutex); - - ret = new_instance_create(dentry->d_iname); - - mutex_lock(&inode->i_mutex); - - return ret; -} - -static int instance_rmdir(struct inode *inode, struct dentry *dentry) -{ - struct dentry *parent; - int ret; - - /* Paranoid: Make sure the parent is the "instances" directory */ - parent = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); - if (WARN_ON_ONCE(parent != trace_instance_dir)) - return -ENOENT; - - /* The caller did a dget() on dentry */ - mutex_unlock(&dentry->d_inode->i_mutex); - - /* - * The inode mutex is locked, but tracefs_create_dir() will also - * take the mutex. As the instances directory can not be destroyed - * or changed in any other way, it is safe to unlock it, and - * let the dentry try. If two users try to make the same dir at - * the same time, then the instance_delete() will determine the - * winner. 
- */ - mutex_unlock(&inode->i_mutex); - - ret = instance_delete(dentry->d_iname); - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); - - return ret; -} - -static const struct inode_operations instance_dir_inode_operations = { - .lookup = simple_lookup, - .mkdir = instance_mkdir, - .rmdir = instance_rmdir, -}; - static __init void create_trace_instances(struct dentry *d_tracer) { - trace_instance_dir = tracefs_create_dir("instances", d_tracer); + trace_instance_dir = tracefs_create_instance_dir("instances", d_tracer, + instance_mkdir, + instance_rmdir); if (WARN_ON(!trace_instance_dir)) return; - - /* Hijack the dir inode operations, to allow mkdir */ - trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; } static void -- cgit v1.3-8-gc7d7 From 74390aa5567827add5058a3b26eff0ed06a629ba Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 27 Jan 2015 17:55:12 +0800 Subject: perf: Remove the extra validity check on nr_pages The function is_power_of_2() also do the check on nr_pages, so the first check performed is unnecessary. On the other hand, the key point is to ensure @nr_pages is a power-of-two number and mostly @nr_pages is a nonzero value, so in the most cases, the function is_power_of_2() will be called. Signed-off-by: Kaixu Xia Cc: Peter Zijlstra Cc: Paul Mackerras Link: http://lkml.kernel.org/r/1422352512-75150-1-git-send-email-xiakaixu@huawei.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 7f2fbb8b5069..0969c9b67eec 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4420,7 +4420,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) + if (!is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) -- cgit v1.3-8-gc7d7 From e0b561ee78d82a4cc7792aa28fa4b1ea15325dcc Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Sun, 15 Feb 2015 10:03:20 +0100 Subject: livepatch: fix format string in kobject_init_and_add() kobject_init_and_add() takes expects format string for a name, so we better provide it in order to avoid infoleaks if modules craft their mod->name in a special way. 
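For illustration, a minimal sketch of the pattern this fix applies (the object names are taken from the patch below; the snippet itself is not part of the patch): kobject_init_and_add() treats its name argument as a printf-style format string, so an untrusted string has to be passed as data behind a constant "%s" format rather than as the format itself.

    /*
     * Unsafe: if a module crafts mod->name to contain specifiers such as
     * "%x" or "%p", the formatter consumes varargs that were never
     * supplied and copies stack/register contents into the kobject name,
     * leaking kernel data.
     */
    ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
                               klp_root_kobj, patch->mod->name);

    /*
     * Safe: the untrusted name is only ever an argument to a constant
     * "%s" format, so its bytes are copied verbatim.
     */
    ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
                               klp_root_kobj, "%s", patch->mod->name);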
Reported-by: Fengguang Wu Reported-by: Kees Cook Acked-by: Seth Jennings Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ff7f47d026ac..69bf3aa3bde8 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -731,7 +731,7 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) func->state = KLP_DISABLED; return kobject_init_and_add(&func->kobj, &klp_ktype_func, - obj->kobj, func->old_name); + obj->kobj, "%s", func->old_name); } /* parts of the initialization that is done only when the object is loaded */ @@ -807,7 +807,7 @@ static int klp_init_patch(struct klp_patch *patch) patch->state = KLP_DISABLED; ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch, - klp_root_kobj, patch->mod->name); + klp_root_kobj, "%s", patch->mod->name); if (ret) goto unlock; -- cgit v1.3-8-gc7d7 From 02cea3958664723a5d2236f0f0058de97c7e4693 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Feb 2015 14:06:23 +0100 Subject: genirq: Provide disable_hardirq() For things like netpoll there is a need to disable an interrupt from atomic context. Currently netpoll uses disable_irq() which will sleep-wait on threaded handlers and thus forced_irqthreads breaks things. Provide disable_hardirq(), which uses synchronize_hardirq() to only wait for active hardirq handlers; also change synchronize_hardirq() to return the status of threaded handlers. This will allow one to try-disable an interrupt from atomic context, or in case of request_threaded_irq() to only wait for the hardirq part. Suggested-by: Sabrina Dubroca Signed-off-by: Peter Zijlstra (Intel) Cc: Arnd Bergmann Cc: David Miller Cc: Eyal Perry Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Quentin Lambert Cc: Randy Dunlap Cc: Russell King Link: http://lkml.kernel.org/r/20150205130623.GH5029@twins.programming.kicks-ass.net [ Fixed typos and such. ] Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 2 +- include/linux/interrupt.h | 1 + kernel/irq/manage.c | 36 ++++++++++++++++++++++++++++++++++-- 3 files changed, 36 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index cba442ec3c66..f4af03404b97 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -9,7 +9,7 @@ extern void synchronize_irq(unsigned int irq); -extern void synchronize_hardirq(unsigned int irq); +extern bool synchronize_hardirq(unsigned int irq); #if defined(CONFIG_TINY_RCU) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index d9b05b5bf8c7..3bb01b9a379c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -184,6 +184,7 @@ extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id); #endif extern void disable_irq_nosync(unsigned int irq); +extern bool disable_hardirq(unsigned int irq); extern void disable_irq(unsigned int irq); extern void disable_percpu_irq(unsigned int irq); extern void enable_irq(unsigned int irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 196a06fbc122..03329c2287eb 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -68,14 +68,20 @@ static void __synchronize_hardirq(struct irq_desc *desc) * Do not use this for shutdown scenarios where you must be sure * that all parts (hardirq and threaded handler) have completed. * + * Returns: false if a threaded handler is active. 
+ * * This function may be called - with care - from IRQ context. */ -void synchronize_hardirq(unsigned int irq) +bool synchronize_hardirq(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - if (desc) + if (desc) { __synchronize_hardirq(desc); + return !atomic_read(&desc->threads_active); + } + + return true; } EXPORT_SYMBOL(synchronize_hardirq); @@ -440,6 +446,32 @@ void disable_irq(unsigned int irq) } EXPORT_SYMBOL(disable_irq); +/** + * disable_hardirq - disables an irq and waits for hardirq completion + * @irq: Interrupt to disable + * + * Disable the selected interrupt line. Enables and Disables are + * nested. + * This function waits for any pending hard IRQ handlers for this + * interrupt to complete before returning. If you use this function while + * holding a resource the hard IRQ handler may need you will deadlock. + * + * When used to optimistically disable an interrupt from atomic context + * the return value must be checked. + * + * Returns: false if a threaded handler is active. + * + * This function may be called - with care - from IRQ context. + */ +bool disable_hardirq(unsigned int irq) +{ + if (!__disable_irq_nosync(irq)) + return synchronize_hardirq(irq); + + return false; +} +EXPORT_SYMBOL_GPL(disable_hardirq); + void __enable_irq(struct irq_desc *desc, unsigned int irq) { switch (desc->depth) { -- cgit v1.3-8-gc7d7 From bd624d75db21ea5402f9ecf4450b311794d80352 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 13 Feb 2015 08:54:56 +0800 Subject: clockevents: Introduce mode specific callbacks It is not possible for the clockevents core to know which modes (other than those with a corresponding feature flag) are supported by a particular implementation. And drivers are expected to handle transition to all modes elegantly, as ->set_mode() would be issued for them unconditionally. Now, adding support for a new mode complicates things a bit if we want to use the legacy ->set_mode() callback. We need to closely review all clockevents drivers to see if they would break on addition of a new mode. And after such reviews, it is found that we have to do non-trivial changes to most of the drivers [1]. Introduce mode-specific set_mode_*() callbacks, some of which the drivers may or may not implement. A missing callback would clearly convey the message that the corresponding mode isn't supported. A driver may still choose to keep supporting the legacy ->set_mode() callback, but ->set_mode() wouldn't be supporting any new modes beyond RESUME. If a driver wants to benefit from using a new mode, it would be required to migrate to the mode specific callbacks. The legacy ->set_mode() callback and the newly introduced mode-specific callbacks are mutually exclusive. Only one of them should be supported by the driver. Sanity check is done at the time of registration to distinguish between optional and required callbacks and to make error recovery and handling simpler. If the legacy ->set_mode() callback is provided, all mode specific ones would be ignored by the core but a warning is thrown if they are present. Call sites calling ->set_mode() directly are also updated to use __clockevents_set_mode() instead, as ->set_mode() may not be available anymore for few drivers. 
[1] https://lkml.org/lkml/2014/12/9/605 [2] https://lkml.org/lkml/2015/1/23/255 Suggested-by: Thomas Gleixner [2] Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Daniel Lezcano Cc: Frederic Weisbecker Cc: John Stultz Cc: Kevin Hilman Cc: Linus Torvalds Cc: Preeti U Murthy Cc: linaro-kernel@lists.linaro.org Cc: linaro-networking@linaro.org Link: http://lkml.kernel.org/r/792d59a40423f0acffc9bb0bec9de1341a06fa02.1423788565.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 21 +++++++++-- kernel/time/clockevents.c | 88 ++++++++++++++++++++++++++++++++++++++++++++-- kernel/time/timer_list.c | 32 +++++++++++++++-- 3 files changed, 134 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 2e4cb67f6e56..59af26b54d15 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -39,6 +39,8 @@ enum clock_event_mode { CLOCK_EVT_MODE_PERIODIC, CLOCK_EVT_MODE_ONESHOT, CLOCK_EVT_MODE_RESUME, + + /* Legacy ->set_mode() callback doesn't support below modes */ }; /* @@ -81,7 +83,11 @@ enum clock_event_mode { * @mode: operating mode assigned by the management code * @features: features * @retries: number of forced programming retries - * @set_mode: set mode function + * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. + * @set_mode_periodic: switch mode to periodic, if !set_mode + * @set_mode_oneshot: switch mode to oneshot, if !set_mode + * @set_mode_shutdown: switch mode to shutdown, if !set_mode + * @set_mode_resume: resume clkevt device, if !set_mode * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration @@ -108,9 +114,20 @@ struct clock_event_device { unsigned int features; unsigned long retries; - void (*broadcast)(const struct cpumask *mask); + /* + * Mode transition callback(s): Only one of the two groups should be + * defined: + * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME. + * - set_mode_{shutdown|periodic|oneshot|resume}(). + */ void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); + int (*set_mode_periodic)(struct clock_event_device *); + int (*set_mode_oneshot)(struct clock_event_device *); + int (*set_mode_shutdown)(struct clock_event_device *); + int (*set_mode_resume)(struct clock_event_device *); + + void (*broadcast)(const struct cpumask *mask); void (*suspend)(struct clock_event_device *); void (*resume)(struct clock_event_device *); unsigned long min_delta_ticks; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 55449909f114..489642b08d64 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,6 +94,57 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); +static int __clockevents_set_mode(struct clock_event_device *dev, + enum clock_event_mode mode) +{ + /* Transition with legacy set_mode() callback */ + if (dev->set_mode) { + /* Legacy callback doesn't support new modes */ + if (mode > CLOCK_EVT_MODE_RESUME) + return -ENOSYS; + dev->set_mode(mode, dev); + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* Transition with new mode-specific callbacks */ + switch (mode) { + case CLOCK_EVT_MODE_UNUSED: + /* + * This is an internal state, which is guaranteed to go from + * SHUTDOWN to UNUSED. 
No driver interaction required. + */ + return 0; + + case CLOCK_EVT_MODE_SHUTDOWN: + return dev->set_mode_shutdown(dev); + + case CLOCK_EVT_MODE_PERIODIC: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) + return -ENOSYS; + return dev->set_mode_periodic(dev); + + case CLOCK_EVT_MODE_ONESHOT: + /* Core internal bug */ + if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) + return -ENOSYS; + return dev->set_mode_oneshot(dev); + + case CLOCK_EVT_MODE_RESUME: + /* Optional callback */ + if (dev->set_mode_resume) + return dev->set_mode_resume(dev); + else + return 0; + + default: + return -ENOSYS; + } +} + /** * clockevents_set_mode - set the operating mode of a clock event device * @dev: device to modify @@ -105,7 +156,9 @@ void clockevents_set_mode(struct clock_event_device *dev, enum clock_event_mode mode) { if (dev->mode != mode) { - dev->set_mode(mode, dev); + if (__clockevents_set_mode(dev, mode)) + return; + dev->mode = mode; /* @@ -373,6 +426,35 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind); +/* Sanity check of mode transition callbacks */ +static int clockevents_sanity_check(struct clock_event_device *dev) +{ + /* Legacy set_mode() callback */ + if (dev->set_mode) { + /* We shouldn't be supporting new modes now */ + WARN_ON(dev->set_mode_periodic || dev->set_mode_oneshot || + dev->set_mode_shutdown || dev->set_mode_resume); + return 0; + } + + if (dev->features & CLOCK_EVT_FEAT_DUMMY) + return 0; + + /* New mode-specific callbacks */ + if (!dev->set_mode_shutdown) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && + !dev->set_mode_periodic) + return -EINVAL; + + if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && + !dev->set_mode_oneshot) + return -EINVAL; + + return 0; +} + /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -382,6 +464,8 @@ void clockevents_register_device(struct clock_event_device *dev) unsigned long flags; BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(clockevents_sanity_check(dev)); + if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); @@ -449,7 +533,7 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) return clockevents_program_event(dev, dev->next_event, false); if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - dev->set_mode(CLOCK_EVT_MODE_PERIODIC, dev); + return __clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); return 0; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 61ed862cdd37..2cfd19485824 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -228,9 +228,35 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); - SEQ_printf(m, "\n"); + if (dev->set_mode) { + SEQ_printf(m, " set_mode: "); + print_name_offset(m, dev->set_mode); + SEQ_printf(m, "\n"); + } else { + if (dev->set_mode_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_mode_shutdown); + SEQ_printf(m, "\n"); + } + + if (dev->set_mode_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_mode_periodic); + SEQ_printf(m, "\n"); + } + + if (dev->set_mode_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_mode_oneshot); + SEQ_printf(m, "\n"); + } + + if (dev->set_mode_resume) { + SEQ_printf(m, " resume: "); + 
print_name_offset(m, dev->set_mode_resume); + SEQ_printf(m, "\n"); + } + } SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); -- cgit v1.3-8-gc7d7 From 095bebf61a460ad7f6a45bb17ddbf3a9df2b4397 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 3 Feb 2015 16:56:48 -0500 Subject: sched/numa: Do not move past the balance point if unbalanced There is a subtle interaction between the logic introduced in commit e63da03639cc ("sched/numa: Allow task switch if load imbalance improves"), the way the load balancer counts the load on each NUMA node, and the way NUMA hinting faults are done. Specifically, the load balancer only counts currently running tasks in the load, while NUMA hinting faults may cause tasks to stop, if the page is locked by another task. This could cause all of the threads of a large single instance workload, like SPECjbb2005, to migrate to the same NUMA node. This was possible because occasionally they all fault on the same few pages, and only one of the threads remains runnable. That thread can move to the process's preferred NUMA node without making the imbalance worse, because nothing else is running at that time. The fix is to check the direction of the net moving of load, and to refuse a NUMA move if it would cause the system to move past the point of balance. In an unbalanced state, only moves that bring us closer to the balance point are allowed. Signed-off-by: Rik van Riel Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: mgorman@suse.de Link: http://lkml.kernel.org/r/20150203165648.0e9ac692@annuminas.surriel.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..28cbacae4e51 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1196,9 +1196,11 @@ static void task_numa_assign(struct task_numa_env *env, static bool load_too_imbalanced(long src_load, long dst_load, struct task_numa_env *env) { - long imb, old_imb; - long orig_src_load, orig_dst_load; long src_capacity, dst_capacity; + long orig_src_load; + long load_a, load_b; + long moved_load; + long imb; /* * The load is corrected for the CPU capacity available on each node. @@ -1211,30 +1213,39 @@ static bool load_too_imbalanced(long src_load, long dst_load, dst_capacity = env->dst_stats.compute_capacity; /* We care about the slope of the imbalance, not the direction. */ - if (dst_load < src_load) - swap(dst_load, src_load); + load_a = dst_load; + load_b = src_load; + if (load_a < load_b) + swap(load_a, load_b); /* Is the difference below the threshold? */ - imb = dst_load * src_capacity * 100 - - src_load * dst_capacity * env->imbalance_pct; + imb = load_a * src_capacity * 100 - + load_b * dst_capacity * env->imbalance_pct; if (imb <= 0) return false; /* * The imbalance is above the allowed threshold. - * Compare it with the old imbalance. + * Allow a move that brings us closer to a balanced situation, + * without moving things past the point of balance. */ orig_src_load = env->src_stats.load; - orig_dst_load = env->dst_stats.load; - if (orig_dst_load < orig_src_load) - swap(orig_dst_load, orig_src_load); - - old_imb = orig_dst_load * src_capacity * 100 - - orig_src_load * dst_capacity * env->imbalance_pct; + /* + * In a task swap, there will be one load moving from src to dst, + * and another moving back. This is the net sum of both moves. 
+ * A simple task move will always have a positive value. + * Allow the move if it brings the system closer to a balanced + * situation, without crossing over the balance point. + */ + moved_load = orig_src_load - src_load; - /* Would this change make things worse? */ - return (imb > old_imb); + if (moved_load > 0) + /* Moving src -> dst. Did we overshoot balance? */ + return src_load * dst_capacity < dst_load * src_capacity; + else + /* Moving dst -> src. Did we overshoot balance? */ + return dst_load * src_capacity < src_load * dst_capacity; } /* -- cgit v1.3-8-gc7d7 From 890a5409f9d0c84d75a1e16eebdfe91d8a57ef1e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 9 Feb 2015 12:30:00 +0100 Subject: sched/numa: Avoid some pointless iterations Commit 81907478c431 ("sched/fair: Avoid using uninitialized variable in preferred_group_nid()") unconditionally initializes max_group with NODE_MASK_NONE, this means that when !max_faults (max_group didn't get set), we'll now continue the iteration with an empty mask. Which in turn makes the actual body of the loop go away, so we'll just iterate until completion; short circuit this by breaking out of the loop as soon as this would happen. Signed-off-by: Jan Beulich Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150209113727.GS5029@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 28cbacae4e51..ee595ef30470 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1774,6 +1774,8 @@ static int preferred_group_nid(struct task_struct *p, int nid) } } /* Next round, evaluate the nodes within max_group. */ + if (!max_faults) + break; nodes = max_group; } return nid; -- cgit v1.3-8-gc7d7 From 07d2413a61db6500f58e614e873eed79d7f2ed72 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 2 Feb 2015 13:59:26 -0800 Subject: locking/mutex: In mutex_spin_on_owner(), return true when owner changes In the mutex_spin_on_owner(), we return true only if lock->owner == NULL. This was beneficial in situations where there were multiple threads simultaneously spinning for the mutex. If another thread got the lock while other spinner(s) were also doing mutex_spin_on_owner(), then the other spinners would stop spinning. This workaround helped reduce the chance that many spinners were simultaneously spinning for the mutex which can help reduce contention in highly contended cases. However, recent changes were made to the optimistic spinning code such that instead of having all spinners simultaneously spin for the mutex, we queue the spinners with an MCS lock such that only one thread spins for the mutex at a time. Furthermore, the OSQ optimizations ensure that spinners in the queue will stop waiting if it needs to reschedule. Now, we don't have to worry about multiple threads spinning on owner at the same time, and if lock->owner is not NULL at this point, it likely means another thread happens to obtain the lock in the fastpath. In this case, it would make sense for the spinner to continue spinning as long as the spinner doesn't need to schedule and the mutex owner is running. This patch changes this so that mutex_spin_on_owner() returns true when the lock owner changes, which means a thread will only stop spinning if it either needs to reschedule or if the lock owner is not running. We saw up to a 5% performance improvement in the fserver workload with this patch. 
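For illustration, a simplified sketch of how an optimistic-spinning caller is expected to consume the new return value (this is not the kernel's actual spinning code, and try_acquire() is a hypothetical stand-in for the real acquisition attempt): a true return means the owner changed, so the lock may now be free and another acquisition attempt is worthwhile; a false return means the owner was preempted or we must reschedule, so the caller should fall back to the sleeping slow path.

    for (;;) {
            struct task_struct *owner = READ_ONCE(lock->owner);

            /* Spin as long as the same owner keeps running on its CPU. */
            if (owner && !mutex_spin_on_owner(lock, owner))
                    break;                  /* owner not running, or need_resched() */

            if (try_acquire(lock))          /* hypothetical stand-in */
                    return true;            /* the owner change paid off */

            cpu_relax_lowlatency();
    }
    return false;                           /* fall back to the sleeping slow path */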
Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Acked-by: Davidlohr Bueso Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Tim Chen Cc: chegu_vinod@hp.com Cc: tglx@linutronix.de Link: http://lkml.kernel.org/r/1422914367-5574-2-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 94674e5919cb..49cce442f3ff 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -250,11 +250,11 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) rcu_read_unlock(); /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. + * We break out of the loop above on either need_resched(), when + * the owner is not running, or when the lock owner changed. + * Return success only when the lock owner changed. */ - return lock->owner == NULL; + return lock->owner != owner; } /* -- cgit v1.3-8-gc7d7 From be1f7bf217ebb1e42190d7d0b332c89ea7871378 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Mon, 2 Feb 2015 13:59:27 -0800 Subject: locking/mutex: Refactor mutex_spin_on_owner() As suggested by Davidlohr, we could refactor mutex_spin_on_owner(). Currently, we split up owner_running() with mutex_spin_on_owner(). When the owner changes, we make duplicate owner checks which are not necessary. It also makes the code a bit obscure as we are using a second check to figure out why we broke out of the loop. This patch modifies it such that we remove the owner_running() function and the mutex_spin_on_owner() loop directly checks for if the owner changes, if the owner is not running, or if we need to reschedule. If the owner changes, we break out of the loop and return true. If the owner is not running or if we need to reschedule, then break out of the loop and return false. Suggested-by: Davidlohr Bueso Signed-off-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Cc: Aswin Chandramouleeswaran Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Tim Chen Cc: chegu_vinod@hp.com Cc: tglx@linutronix.de Link: http://lkml.kernel.org/r/1422914367-5574-3-git-send-email-jason.low2@hp.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 49cce442f3ff..59cd6c30421e 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -217,44 +217,41 @@ ww_mutex_set_context_slowpath(struct ww_mutex *lock, } #ifdef CONFIG_MUTEX_SPIN_ON_OWNER -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - /* * Look out! "owner" is an entirely speculative pointer * access and not reliable. 
*/ static noinline -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) { + bool ret; + rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) + while (true) { + /* Return success when the lock owner changed */ + if (lock->owner != owner) { + ret = true; break; + } + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking lock->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + if (!owner->on_cpu || need_resched()) { + ret = false; + break; + } cpu_relax_lowlatency(); } rcu_read_unlock(); - /* - * We break out of the loop above on either need_resched(), when - * the owner is not running, or when the lock owner changed. - * Return success only when the lock owner changed. - */ - return lock->owner != owner; + return ret; } /* -- cgit v1.3-8-gc7d7 From a21294644623ee41034db60e93aaebed4db0e57b Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 2 Feb 2015 15:05:36 +0100 Subject: locking/futex: Check PF_KTHREAD rather than !p->mm to filter out kthreads attach_to_pi_owner() checks p->mm to prevent attaching to kthreads and this looks doubly wrong: 1. It should actually check PF_KTHREAD, kthread can do use_mm(). 2. If this task is not kthread and it is actually the lock owner we can wrongly return -EPERM instead of -ESRCH or retry-if-EAGAIN. And note that this wrong EPERM is the likely case unless the exiting task is (auto)reaped quickly, we check ->mm before PF_EXITING. Signed-off-by: Oleg Nesterov Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Darren Hart Cc: Davidlohr Bueso Cc: Jerome Marchand Cc: Larry Woodman Cc: Linus Torvalds Cc: Mateusz Guzik Link: http://lkml.kernel.org/r/20150202140536.GA26406@redhat.com Signed-off-by: Ingo Molnar --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 4eeb63de7e54..1f6d646eee4a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -900,7 +900,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, if (!p) return -ESRCH; - if (!p->mm) { + if (unlikely(p->flags & PF_KTHREAD)) { put_task_struct(p); return -EPERM; } -- cgit v1.3-8-gc7d7 From 49e4b2bcf7b812e985e65b6c8a0255b1520a6e7e Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 30 Jan 2015 01:14:24 -0800 Subject: locking/rwsem: Document barrier need when waking tasks The need for the smp_mb() in __rwsem_do_wake() should be properly documented. Applies to both xadd and spinlock variants. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Jason Low Cc: Linus Torvalds Cc: Michel Lespinasse Cc: Paul E. 
McKenney Cc: Tim Chen Link: http://lkml.kernel.org/r/1422609267-15102-3-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-spinlock.c | 7 +++++++ kernel/locking/rwsem-xadd.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c index 2555ae15ec14..3a5048572065 100644 --- a/kernel/locking/rwsem-spinlock.c +++ b/kernel/locking/rwsem-spinlock.c @@ -85,6 +85,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, int wakewrite) list_del(&waiter->list); tsk = waiter->task; + /* + * Make sure we do not wakeup the next reader before + * setting the nil condition to grant the next reader; + * otherwise we could miss the wakeup on the other + * side and end up sleeping again. See the pairing + * in rwsem_down_read_failed(). + */ smp_mb(); waiter->task = NULL; wake_up_process(tsk); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2f7cc4076f50..82aba467564a 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -186,6 +186,13 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type) waiter = list_entry(next, struct rwsem_waiter, list); next = waiter->list.next; tsk = waiter->task; + /* + * Make sure we do not wakeup the next reader before + * setting the nil condition to grant the next reader; + * otherwise we could miss the wakeup on the other + * side and end up sleeping again. See the pairing + * in rwsem_down_read_failed(). + */ smp_mb(); waiter->task = NULL; wake_up_process(tsk); -- cgit v1.3-8-gc7d7 From 7a215f89a0335582292ec6f3edaa3abd570da75a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 30 Jan 2015 01:14:25 -0800 Subject: locking/rwsem: Set lock ownership ASAP In order to optimize the spinning step, we need to set the lock owner as soon as the lock is acquired; after a successful counter cmpxchg operation, that is. This is particularly useful as rwsems need to set the owner to nil for readers, so there is a greater chance of falling out of the spinning. Currently we only set the owner much later in the game, in the more generic level -- latency can be specially bad when waiting for a node->next pointer when releasing the osq in up_write calls. As such, update the owner inside rwsem_try_write_lock (when the lock is obtained after blocking) and rwsem_try_write_lock_unqueued (when the lock is obtained while spinning). This requires creating a new internal rwsem.h header to share the owner related calls. Also cleanup some headers for mutex and rwsem. Suggested-by: Peter Zijlstra Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Jason Low Cc: Linus Torvalds Cc: Michel Lespinasse Cc: Paul E. 
McKenney Cc: Tim Chen Link: http://lkml.kernel.org/r/1422609267-15102-4-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 2 +- kernel/locking/rwsem-xadd.c | 8 ++++++-- kernel/locking/rwsem.c | 22 +--------------------- kernel/locking/rwsem.h | 20 ++++++++++++++++++++ 4 files changed, 28 insertions(+), 24 deletions(-) create mode 100644 kernel/locking/rwsem.h (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 59cd6c30421e..43bf25ef3c81 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -25,7 +25,7 @@ #include #include #include -#include "mcs_spinlock.h" +#include /* * In the DEBUG case we are using the "NULL fastpath" for mutexes, diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 82aba467564a..07713e5d9713 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -14,8 +14,9 @@ #include #include #include +#include -#include "mcs_spinlock.h" +#include "rwsem.h" /* * Guide to the rw_semaphore's count field for common values. @@ -265,6 +266,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) { if (!list_is_singular(&sem->wait_list)) rwsem_atomic_update(RWSEM_WAITING_BIAS, sem); + rwsem_set_owner(sem); return true; } @@ -284,8 +286,10 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) return false; old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS); - if (old == count) + if (old == count) { + rwsem_set_owner(sem); return true; + } count = old; } diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index e2d3bc7f03b4..205be0ce34de 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -9,29 +9,9 @@ #include #include #include - #include -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ - sem->owner = current; -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ - sem->owner = NULL; -} - -#else -static inline void rwsem_set_owner(struct rw_semaphore *sem) -{ -} - -static inline void rwsem_clear_owner(struct rw_semaphore *sem) -{ -} -#endif +#include "rwsem.h" /* * lock for reading diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h new file mode 100644 index 000000000000..870ed9a5b426 --- /dev/null +++ b/kernel/locking/rwsem.h @@ -0,0 +1,20 @@ +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ + sem->owner = current; +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ + sem->owner = NULL; +} + +#else +static inline void rwsem_set_owner(struct rw_semaphore *sem) +{ +} + +static inline void rwsem_clear_owner(struct rw_semaphore *sem) +{ +} +#endif -- cgit v1.3-8-gc7d7 From b3fd4f03ca0b9952221f39ae6790e698bf4b39e7 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 30 Jan 2015 01:14:26 -0800 Subject: locking/rwsem: Avoid deceiving lock spinners When readers hold the semaphore, the ->owner is nil. As such, and unlike mutexes, '!owner' does not necessarily imply that the lock is free. This will cause writers to potentially spin excessively as they've been mislead to thinking they have a chance of acquiring the lock, instead of blocking. This patch therefore enhances the counter check when the owner is not set by the time we've broken out of the loop. Otherwise we can return true as a new owner has the lock and thus we want to continue spinning. 
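In short, once the spin loop breaks out with ->owner still unset, the decision now looks roughly like this (a condensed restatement of the change, not the literal patch below):

	if (READ_ONCE(sem->owner))
		return true;	/* a new writer owns it: keep spinning */

	/*
	 * No owner recorded: the lock is only truly free if the count
	 * shows no active readers or writer; otherwise readers hold it
	 * and spinning on ->owner would be futile.
	 */
	count = READ_ONCE(sem->count);
	return (count == 0 || count == RWSEM_WAITING_BIAS);
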
While at it, we can make rwsem_spin_on_owner() less ambiguos and return right away under need_resched conditions. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Cc: Jason Low Cc: Linus Torvalds Cc: Michel Lespinasse Cc: Paul E. McKenney Cc: Tim Chen Link: http://lkml.kernel.org/r/1422609267-15102-5-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 07713e5d9713..1c0d11e8ce34 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -337,21 +337,30 @@ static inline bool owner_running(struct rw_semaphore *sem, static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) { + long count; + rcu_read_lock(); while (owner_running(sem, owner)) { - if (need_resched()) - break; + /* abort spinning when need_resched */ + if (need_resched()) { + rcu_read_unlock(); + return false; + } cpu_relax_lowlatency(); } rcu_read_unlock(); + if (READ_ONCE(sem->owner)) + return true; /* new owner, continue spinning */ + /* - * We break out the loop above on need_resched() or when the - * owner changed, which is a sign for heavy contention. Return - * success only when sem->owner is NULL. + * When the owner is not set, the lock could be free or + * held by readers. Check the counter to verify the + * state. */ - return sem->owner == NULL; + count = READ_ONCE(sem->count); + return (count == 0 || count == RWSEM_WAITING_BIAS); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) -- cgit v1.3-8-gc7d7 From 1a99367023f6ac664365a37fa508b059e31d0e88 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Fri, 30 Jan 2015 01:14:27 -0800 Subject: locking/rwsem: Check for active lock before bailing on spinning 37e9562453b ("locking/rwsem: Allow conservative optimistic spinning when readers have lock") forced the default for optimistic spinning to be disabled if the lock owner was nil, which makes much sense for readers. However, while it is not our priority, we can make some optimizations for write-mostly workloads. We can bail the spinning step and still be conservative if there are any active tasks, otherwise there's really no reason not to spin, as the semaphore is most likely unlocked. This patch recovers most of a Unixbench 'execl' benchmark throughput by sleeping less and making better average system usage: before: CPU %user %nice %system %iowait %steal %idle all 0.60 0.00 8.02 0.00 0.00 91.38 after: CPU %user %nice %system %iowait %steal %idle all 1.22 0.00 70.18 0.00 0.00 28.60 Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Acked-by: Jason Low Cc: Linus Torvalds Cc: Michel Lespinasse Cc: Paul E. 
McKenney Cc: Tim Chen Link: http://lkml.kernel.org/r/1422609267-15102-6-git-send-email-dave@stgolabs.net Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 1c0d11e8ce34..e4ad019e23f5 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -298,23 +298,30 @@ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner; - bool on_cpu = false; + bool ret = true; if (need_resched()) return false; rcu_read_lock(); owner = ACCESS_ONCE(sem->owner); - if (owner) - on_cpu = owner->on_cpu; - rcu_read_unlock(); + if (!owner) { + long count = ACCESS_ONCE(sem->count); + /* + * If sem->owner is not set, yet we have just recently entered the + * slowpath with the lock being active, then there is a possibility + * reader(s) may have the lock. To be safe, bail spinning in these + * situations. + */ + if (count & RWSEM_ACTIVE_MASK) + ret = false; + goto done; + } - /* - * If sem->owner is not set, yet we have just recently entered the - * slowpath, then there is a possibility reader(s) may have the lock. - * To be safe, avoid spinning in these situations. - */ - return on_cpu; + ret = owner->on_cpu; +done: + rcu_read_unlock(); + return ret; } static inline bool owner_running(struct rw_semaphore *sem, -- cgit v1.3-8-gc7d7 From 72f669c0086fbbbbebc92ce7390125722c4c0ec5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 5 Feb 2015 15:55:31 -0800 Subject: perf: Update shadow timestamp before add event Update the shadow timestamp before start event, because .add might use the timestamp. Signed-off-by: Shaohua Li Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Linus Torvalds Cc: Paul Mackerras Link: http://lkml.kernel.org/r/9cd0276d6a047cb7c2885994f25e3a1f7c8c28af.1423180257.git.shli@fb.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 13209a90b751..e580e0f41ac6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1881,6 +1881,10 @@ event_sched_in(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_running += tstamp - event->tstamp_stopped; + + perf_set_shadow_time(event, ctx, tstamp); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -1888,10 +1892,6 @@ event_sched_in(struct perf_event *event, goto out; } - event->tstamp_running += tstamp - event->tstamp_stopped; - - perf_set_shadow_time(event, ctx, tstamp); - if (!is_software_event(event)) cpuctx->active_oncpu++; if (!ctx->nr_active++) -- cgit v1.3-8-gc7d7 From 6a694a607a97d58c042fb7fbd60ef1caea26950c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 5 Feb 2015 15:55:32 -0800 Subject: perf: Update userspace page info for software event For hardware events, the userspace page of the event gets updated in context switches, so if we read the timestamp in the page, we get fresh info. For software events, this is missing currently. This patch makes the behavior consistent. With this patch, we can implement clock_gettime(THREAD_CPUTIME) with PERF_COUNT_SW_DUMMY in userspace as suggested by Andy and Peter. 
Code like this: if (pc->cap_user_time) { do { seq = pc->lock; barrier(); running = pc->time_running; cyc = rdtsc(); time_mult = pc->time_mult; time_shift = pc->time_shift; time_offset = pc->time_offset; barrier(); } while (pc->lock != seq); quot = (cyc >> time_shift); rem = cyc & ((1 << time_shift) - 1); delta = time_offset + quot * time_mult + ((rem * time_mult) >> time_shift); running += delta; return running; } I tried it on a busy system, the userspace page updating doesn't have noticeable overhead. Signed-off-by: Shaohua Li Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Linus Torvalds Link: http://lkml.kernel.org/r/aa2dd2e4f1e9f2225758be5ba00f14d6909a8ce1.1423180257.git.shli@fb.com [ Improved the changelog. ] Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index e580e0f41ac6..fef45b4bb5f8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6123,6 +6123,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) } hlist_add_head_rcu(&event->hlist_entry, head); + perf_event_update_userpage(event); return 0; } @@ -6592,6 +6593,7 @@ static int cpu_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) cpu_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } @@ -6666,6 +6668,7 @@ static int task_clock_event_add(struct perf_event *event, int flags) { if (flags & PERF_EF_START) task_clock_event_start(event, flags); + perf_event_update_userpage(event); return 0; } -- cgit v1.3-8-gc7d7 From ba532500c5651a4be4108acc64ed99a95cb005b3 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:58 -0500 Subject: perf: Introduce pmu context switch callback The callback is invoked when process is scheduled in or out. It provides mechanism for later patches to save/store the LBR stack. For the schedule in case, the callback is invoked at the same place that flush branch stack callback is invoked. So it also can replace the flush branch stack callback. To avoid unnecessary overhead, the callback is enabled only when there are events use the LBR stack. 
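A hypothetical driver-side sketch of the new interface (the my_* names are invented for illustration; the real user is the Intel LBR code converted later in this series): the driver implements ->sched_task() and brackets the period during which it needs callbacks with perf_sched_cb_inc()/perf_sched_cb_dec().

	static void my_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
	{
		if (sched_in)
			my_pmu_reset_stale_hw_state();	/* e.g. clear old LBR entries */
	}

	static void my_pmu_event_enable(struct perf_event *event)
	{
		perf_sched_cb_inc(event->ctx->pmu);	/* start receiving callbacks */
	}

	static void my_pmu_event_disable(struct perf_event *event)
	{
		perf_sched_cb_dec(event->ctx->pmu);	/* stop once no event needs them */
	}

	static struct pmu my_pmu = {
		...
		.sched_task	= my_pmu_sched_task,
	};
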
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-3-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++++ arch/x86/kernel/cpu/perf_event.h | 2 ++ include/linux/perf_event.h | 9 +++++++ kernel/events/core.c | 57 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..0efbd6cc2966 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1914,6 +1914,12 @@ static const struct attribute_group *x86_pmu_attr_groups[] = { NULL, }; +static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (x86_pmu.sched_task) + x86_pmu.sched_task(ctx, sched_in); +} + static void x86_pmu_flush_branch_stack(void) { if (x86_pmu.flush_branch_stack) @@ -1950,6 +1956,7 @@ static struct pmu pmu = { .event_idx = x86_pmu_event_idx, .flush_branch_stack = x86_pmu_flush_branch_stack, + .sched_task = x86_pmu_sched_task, }; void arch_perf_update_userpage(struct perf_event *event, diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 0c45b22495dc..211b54c0c00c 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -473,6 +473,8 @@ struct x86_pmu { void (*check_microcode)(void); void (*flush_branch_stack)(void); + void (*sched_task)(struct perf_event_context *ctx, + bool sched_in); /* * Intel Arch Perfmon v2+ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 33262004c310..fbab6235d053 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -265,6 +265,13 @@ struct pmu { * flush branch stack on context-switches (needed in cpu-wide mode) */ void (*flush_branch_stack) (void); + + /* + * context-switches callback + */ + void (*sched_task) (struct perf_event_context *ctx, + bool sched_in); + }; /** @@ -558,6 +565,8 @@ extern void perf_event_delayed_put(struct task_struct *task); extern void perf_event_print_debug(void); extern void perf_pmu_disable(struct pmu *pmu); extern void perf_pmu_enable(struct pmu *pmu); +extern void perf_sched_cb_dec(struct pmu *pmu); +extern void perf_sched_cb_inc(struct pmu *pmu); extern int perf_event_task_disable(void); extern int perf_event_task_enable(void); extern int perf_event_refresh(struct perf_event *event, int refresh); diff --git a/kernel/events/core.c b/kernel/events/core.c index fef45b4bb5f8..6c8b31b7efb6 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -154,6 +154,7 @@ enum event_type_t { struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); +static DEFINE_PER_CPU(int, perf_sched_cb_usages); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -2577,6 +2578,56 @@ unlock: } } +void perf_sched_cb_dec(struct pmu *pmu) +{ + this_cpu_dec(perf_sched_cb_usages); +} + +void perf_sched_cb_inc(struct pmu *pmu) +{ + this_cpu_inc(perf_sched_cb_usages); +} + +/* + * This function provides the context switch callback to the lower code + * layer. It is invoked ONLY when the context switch callback is enabled. 
+ */ +static void perf_pmu_sched_task(struct task_struct *prev, + struct task_struct *next, + bool sched_in) +{ + struct perf_cpu_context *cpuctx; + struct pmu *pmu; + unsigned long flags; + + if (prev == next) + return; + + local_irq_save(flags); + + rcu_read_lock(); + + list_for_each_entry_rcu(pmu, &pmus, entry) { + if (pmu->sched_task) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + perf_ctx_lock(cpuctx, cpuctx->task_ctx); + + perf_pmu_disable(pmu); + + pmu->sched_task(cpuctx->task_ctx, sched_in); + + perf_pmu_enable(pmu); + + perf_ctx_unlock(cpuctx, cpuctx->task_ctx); + } + } + + rcu_read_unlock(); + + local_irq_restore(flags); +} + #define for_each_task_context_nr(ctxn) \ for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) @@ -2596,6 +2647,9 @@ void __perf_event_task_sched_out(struct task_struct *task, { int ctxn; + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(task, next, false); + for_each_task_context_nr(ctxn) perf_event_context_sched_out(task, ctxn, next); @@ -2847,6 +2901,9 @@ void __perf_event_task_sched_in(struct task_struct *prev, /* check for system-wide branch_stack events */ if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) perf_branch_stack_sched_in(prev, task); + + if (__this_cpu_read(perf_sched_cb_usages)) + perf_pmu_sched_task(prev, task, true); } static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) -- cgit v1.3-8-gc7d7 From 2a0ad3b326a9024ba86dca4028499d31fa0c6c4d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:55:59 -0500 Subject: perf/x86/intel: Use context switch callback to flush LBR stack Previous commit introduces context switch callback, its function overlaps with the flush branch stack callback. So we can use the context switch callback to flush LBR stack. This patch adds code that uses the flush branch callback to flush the LBR stack when task is being scheduled in. The callback is enabled only when there are events use the LBR hardware. This patch also removes all old flush branch stack code. 
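The conversion amounts to swapping one hook for the other; roughly (an illustrative summary of the diff below, not new code):

	/* old: a dedicated callback that could only mean "flush now" */
	.flush_branch_stack	= intel_pmu_flush_branch_stack,

	/* new: the generic context-switch callback, which also knows the direction */
	.sched_task		= intel_pmu_lbr_sched_task,

with the reset done from intel_pmu_lbr_sched_task() only when sched_in is true and only if the CPU actually has LBR registers (x86_pmu.lbr_nr).
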
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vince Weaver Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-4-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 --- arch/x86/kernel/cpu/perf_event.h | 3 +- arch/x86/kernel/cpu/perf_event_intel.c | 14 +----- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 27 +++++++++++ include/linux/perf_event.h | 1 - kernel/events/core.c | 77 ------------------------------ 6 files changed, 30 insertions(+), 99 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0efbd6cc2966..6b1fd26a37cf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1920,12 +1920,6 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in) x86_pmu.sched_task(ctx, sched_in); } -static void x86_pmu_flush_branch_stack(void) -{ - if (x86_pmu.flush_branch_stack) - x86_pmu.flush_branch_stack(); -} - void perf_check_microcode(void) { if (x86_pmu.check_microcode) @@ -1955,7 +1949,6 @@ static struct pmu pmu = { .commit_txn = x86_pmu_commit_txn, .event_idx = x86_pmu_event_idx, - .flush_branch_stack = x86_pmu_flush_branch_stack, .sched_task = x86_pmu_sched_task, }; diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 211b54c0c00c..949d0083a29e 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -472,7 +472,6 @@ struct x86_pmu { void (*cpu_dead)(int cpu); void (*check_microcode)(void); - void (*flush_branch_stack)(void); void (*sched_task)(struct perf_event_context *ctx, bool sched_in); @@ -733,6 +732,8 @@ void intel_pmu_pebs_disable_all(void); void intel_ds_init(void); +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); + void intel_pmu_lbr_reset(void); void intel_pmu_lbr_enable(struct perf_event *event); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 498b6d967138..424fbf74dee7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2044,18 +2044,6 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } -static void intel_pmu_flush_branch_stack(void) -{ - /* - * Intel LBR does not tag entries with the - * PID of the current task, then we need to - * flush it on ctxsw - * For now, we simply reset it - */ - if (x86_pmu.lbr_nr) - intel_pmu_lbr_reset(); -} - PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -2107,7 +2095,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, - .flush_branch_stack = intel_pmu_flush_branch_stack, + .sched_task = intel_pmu_lbr_sched_task, }; static __init void intel_clovertown_quirk(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 8bc078f43a82..c0e23c5f85bb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -177,6 +177,31 @@ void intel_pmu_lbr_reset(void) intel_pmu_lbr_reset_64(); } +void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + + if 
(!x86_pmu.lbr_nr) + return; + + /* + * When sampling the branck stack in system-wide, it may be + * necessary to flush the stack on context switch. This happens + * when the branch stack does not tag its entries with the pid + * of the current task. Otherwise it becomes impossible to + * associate a branch entry with a task. This ambiguity is more + * likely to appear when the branch stack supports priv level + * filtering and the user sets it to monitor only at the user + * level (which could be a useful measurement in system-wide + * mode). In that case, the risk is high of having a branch + * stack with branch from multiple tasks. + */ + if (sched_in) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = ctx; + } +} + void intel_pmu_lbr_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -195,6 +220,7 @@ void intel_pmu_lbr_enable(struct perf_event *event) cpuc->br_sel = event->hw.branch_reg.reg; cpuc->lbr_users++; + perf_sched_cb_inc(event->ctx->pmu); } void intel_pmu_lbr_disable(struct perf_event *event) @@ -206,6 +232,7 @@ void intel_pmu_lbr_disable(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); + perf_sched_cb_dec(event->ctx->pmu); if (cpuc->enabled && !cpuc->lbr_users) { __intel_pmu_lbr_disable(); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index fbab6235d053..c7007a564440 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -511,7 +511,6 @@ struct perf_event_context { u64 generation; int pin_count; int nr_cgroups; /* cgroup evts */ - int nr_branch_stack; /* branch_stack evt */ struct rcu_head rcu_head; struct delayed_work orphans_remove; diff --git a/kernel/events/core.c b/kernel/events/core.c index 6c8b31b7efb6..f563ce767f93 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -153,7 +153,6 @@ enum event_type_t { */ struct static_key_deferred perf_sched_events __read_mostly; static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); -static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); static atomic_t nr_mmap_events __read_mostly; @@ -1240,9 +1239,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) ctx->nr_cgroups++; - if (has_branch_stack(event)) - ctx->nr_branch_stack++; - list_add_rcu(&event->event_entry, &ctx->event_list); ctx->nr_events++; if (event->attr.inherit_stat) @@ -1409,9 +1405,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) cpuctx->cgrp = NULL; } - if (has_branch_stack(event)) - ctx->nr_branch_stack--; - ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; @@ -2808,64 +2801,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, perf_ctx_unlock(cpuctx, ctx); } -/* - * When sampling the branck stack in system-wide, it may be necessary - * to flush the stack on context switch. This happens when the branch - * stack does not tag its entries with the pid of the current task. - * Otherwise it becomes impossible to associate a branch entry with a - * task. This ambiguity is more likely to appear when the branch stack - * supports priv level filtering and the user sets it to monitor only - * at the user level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch stack with - * branch from multiple tasks. Flushing may mean dropping the existing - * entries or stashing them somewhere in the PMU specific code layer. 
- * - * This function provides the context switch callback to the lower code - * layer. It is invoked ONLY when there is at least one system-wide context - * with at least one active event using taken branch sampling. - */ -static void perf_branch_stack_sched_in(struct task_struct *prev, - struct task_struct *task) -{ - struct perf_cpu_context *cpuctx; - struct pmu *pmu; - unsigned long flags; - - /* no need to flush branch stack if not changing task */ - if (prev == task) - return; - - local_irq_save(flags); - - rcu_read_lock(); - - list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - /* - * check if the context has at least one - * event using PERF_SAMPLE_BRANCH_STACK - */ - if (cpuctx->ctx.nr_branch_stack > 0 - && pmu->flush_branch_stack) { - - perf_ctx_lock(cpuctx, cpuctx->task_ctx); - - perf_pmu_disable(pmu); - - pmu->flush_branch_stack(); - - perf_pmu_enable(pmu); - - perf_ctx_unlock(cpuctx, cpuctx->task_ctx); - } - } - - rcu_read_unlock(); - - local_irq_restore(flags); -} - /* * Called from scheduler to add the events of the current task * with interrupts disabled. @@ -2898,10 +2833,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) perf_cgroup_sched_in(prev, task); - /* check for system-wide branch_stack events */ - if (atomic_read(this_cpu_ptr(&perf_branch_stack_events))) - perf_branch_stack_sched_in(prev, task); - if (__this_cpu_read(perf_sched_cb_usages)) perf_pmu_sched_task(prev, task, true); } @@ -3480,10 +3411,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_dec(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } @@ -7139,10 +7066,6 @@ static void account_event_cpu(struct perf_event *event, int cpu) if (event->parent) return; - if (has_branch_stack(event)) { - if (!(event->attach_state & PERF_ATTACH_TASK)) - atomic_inc(&per_cpu(perf_branch_stack_events, cpu)); - } if (is_cgroup_event(event)) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } -- cgit v1.3-8-gc7d7 From 4af57ef28c2c1047fda9e1a5be02aa7a6a69cf9e Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:01 -0500 Subject: perf: Add pmu specific data for perf task context Introduce a new flag PERF_ATTACH_TASK_DATA for perf event's attach stata. The flag is set by PMU's event_init() callback, it indicates that perf event needs PMU specific data. The PMU specific data are initialized to zeros. Later patches will use PMU specific data to save LBR stack. 
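A hypothetical sketch of how a driver opts in (the my_* struct and helper names are invented; the core-side allocation that kzalloc's and later frees the buffer is in the diff below): the event_init() callback sets PERF_ATTACH_TASK_DATA, and the pmu advertises how much per-task storage it needs via task_ctx_size.

	struct my_task_ctx {
		u64	saved_lbr_from[16];	/* whatever must survive per task */
		u64	saved_lbr_to[16];
	};

	static int my_pmu_event_init(struct perf_event *event)
	{
		/* ... usual event validation ... */
		if (my_event_needs_task_state(event))	/* invented predicate */
			event->attach_state |= PERF_ATTACH_TASK_DATA;
		return 0;
	}

	static struct pmu my_pmu = {
		.event_init	= my_pmu_event_init,
		.task_ctx_size	= sizeof(struct my_task_ctx),	/* zeroed by the core */
	};
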
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-6-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 40 ++++++++++++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index c7007a564440..270cd0173e61 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -271,6 +271,10 @@ struct pmu { */ void (*sched_task) (struct perf_event_context *ctx, bool sched_in); + /* + * PMU specific data size + */ + size_t task_ctx_size; }; @@ -307,6 +311,7 @@ struct swevent_hlist { #define PERF_ATTACH_CONTEXT 0x01 #define PERF_ATTACH_GROUP 0x02 #define PERF_ATTACH_TASK 0x04 +#define PERF_ATTACH_TASK_DATA 0x08 struct perf_cgroup; struct ring_buffer; @@ -511,6 +516,7 @@ struct perf_event_context { u64 generation; int pin_count; int nr_cgroups; /* cgroup evts */ + void *task_ctx_data; /* pmu specific data */ struct rcu_head rcu_head; struct delayed_work orphans_remove; diff --git a/kernel/events/core.c b/kernel/events/core.c index f563ce767f93..688086bb7144 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -905,6 +905,15 @@ static void get_ctx(struct perf_event_context *ctx) WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); } +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx->task_ctx_data); + kfree(ctx); +} + static void put_ctx(struct perf_event_context *ctx) { if (atomic_dec_and_test(&ctx->refcount)) { @@ -912,7 +921,7 @@ static void put_ctx(struct perf_event_context *ctx) put_ctx(ctx->parent_ctx); if (ctx->task) put_task_struct(ctx->task); - kfree_rcu(ctx, rcu_head); + call_rcu(&ctx->rcu_head, free_ctx); } } @@ -3309,12 +3318,15 @@ errout: * Returns a matching context with refcount and pincount. 
*/ static struct perf_event_context * -find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) +find_get_context(struct pmu *pmu, struct task_struct *task, + struct perf_event *event) { struct perf_event_context *ctx, *clone_ctx = NULL; struct perf_cpu_context *cpuctx; + void *task_ctx_data = NULL; unsigned long flags; int ctxn, err; + int cpu = event->cpu; if (!task) { /* Must be root to operate on a CPU event: */ @@ -3342,11 +3354,24 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) if (ctxn < 0) goto errout; + if (event->attach_state & PERF_ATTACH_TASK_DATA) { + task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL); + if (!task_ctx_data) { + err = -ENOMEM; + goto errout; + } + } + retry: ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { clone_ctx = unclone_ctx(ctx); ++ctx->pin_count; + + if (task_ctx_data && !ctx->task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } raw_spin_unlock_irqrestore(&ctx->lock, flags); if (clone_ctx) @@ -3357,6 +3382,11 @@ retry: if (!ctx) goto errout; + if (task_ctx_data) { + ctx->task_ctx_data = task_ctx_data; + task_ctx_data = NULL; + } + err = 0; mutex_lock(&task->perf_event_mutex); /* @@ -3383,9 +3413,11 @@ retry: } } + kfree(task_ctx_data); return ctx; errout: + kfree(task_ctx_data); return ERR_PTR(err); } @@ -7559,7 +7591,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, task, event->cpu); + ctx = find_get_context(pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -7765,7 +7797,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, account_event(event); - ctx = find_get_context(event->pmu, task, cpu); + ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_free; -- cgit v1.3-8-gc7d7 From 5a158c3ccd2183a7b0866be6685d001fe653430f Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:02 -0500 Subject: perf: Always switch pmu specific data during context switch If two tasks were both forked from the same parent task, Events in their perf task contexts can be the same. Perf core may leave out switching the perf event contexts. Previous patch inroduces pmu specific data. The data is for saving the LBR stack, it is task specific. So we need to switch the data even when context switch is optimized out. 
Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-7-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 688086bb7144..84451c0debba 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2562,6 +2562,9 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; + + swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); -- cgit v1.3-8-gc7d7 From a46a23000198d929391aa9dac8de68734efa2703 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 4 Nov 2014 21:56:06 -0500 Subject: perf: Simplify the branch stack check Use event->attr.branch_sample_type to replace intel_pmu_needs_lbr_smpl() for avoiding duplicated code that implicitly enables the LBR. Currently, branch stack can be enabled by user explicitly requesting branch sampling or implicit branch sampling to correct PEBS skid. For user explicitly requested branch sampling, the branch_sample_type is explicitly set by user. For PEBS case, the branch_sample_type is also implicitly set to PERF_SAMPLE_BRANCH_ANY in x86_pmu_hw_config. Signed-off-by: Yan, Zheng Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Paul Mackerras Cc: eranian@google.com Cc: jolsa@redhat.com Link: http://lkml.kernel.org/r/1415156173-10035-11-git-send-email-kan.liang@intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 20 +++----------------- include/linux/perf_event.h | 5 +++++ kernel/events/core.c | 3 +++ 3 files changed, 11 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a485ba124476..9f1dd18fa395 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1029,20 +1029,6 @@ static __initconst const u64 slm_hw_cache_event_ids }, }; -static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event) -{ - /* user explicitly requested branch sampling */ - if (has_branch_stack(event)) - return true; - - /* implicit branch sampling to correct PEBS skid */ - if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 && - x86_pmu.intel_cap.pebs_format < 2) - return true; - - return false; -} - static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); @@ -1207,7 +1193,7 @@ static void intel_pmu_disable_event(struct perf_event *event) * must disable before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_disable(event); if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { @@ -1268,7 +1254,7 @@ static void intel_pmu_enable_event(struct perf_event *event) * must enabled before any actual event * because any event may be combined with LBR */ - if (intel_pmu_needs_lbr_smpl(event)) + if (needs_branch_stack(event)) intel_pmu_lbr_enable(event); if (event->attr.exclude_host) @@ -1747,7 +1733,7 @@ static int intel_pmu_hw_config(struct perf_event *event) if (event->attr.precise_ip && 
x86_pmu.pebs_aliases) x86_pmu.pebs_aliases(event); - if (intel_pmu_needs_lbr_smpl(event)) { + if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); if (ret) return ret; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 270cd0173e61..43cc158487e6 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -814,6 +814,11 @@ static inline bool has_branch_stack(struct perf_event *event) return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK; } +static inline bool needs_branch_stack(struct perf_event *event) +{ + return event->attr.branch_sample_type != 0; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); diff --git a/kernel/events/core.c b/kernel/events/core.c index 84451c0debba..257eccf9afd4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7232,6 +7232,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto err_ns; + if (!has_branch_stack(event)) + event->attr.branch_sample_type = 0; + pmu = perf_init_event(event); if (!pmu) goto err_ns; -- cgit v1.3-8-gc7d7 From 1e78cdbd9b2266503339accafe0ebdd99b93a531 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 16 Feb 2015 15:23:49 -0500 Subject: sched/rt/nohz: Stop scheduler tick if running realtime task If the CPU is running a realtime task that does not round-robin with another realtime task of equal priority, there is no point in keeping the scheduler tick going. After all, whenever the scheduler tick runs, the kernel will just decide not to reschedule. Extend sched_can_stop_tick() to recognize these situations, and inform the rest of the kernel that the scheduler tick can be stopped. Tested-by: Luiz Capitulino Signed-off-by: Rik van Riel Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: fweisbec@redhat.com Cc: mtosatti@redhat.com Link: http://lkml.kernel.org/r/20150216152349.6a8ed824@annuminas.surriel.com [ Small cleanliness tweak. ] Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a4869bd426ca..97fe79cf613e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -689,6 +689,23 @@ static inline bool got_nohz_idle_kick(void) #ifdef CONFIG_NO_HZ_FULL bool sched_can_stop_tick(void) { + /* + * FIFO realtime policy runs the highest priority task. Other runnable + * tasks are of a lower priority. The scheduler tick does nothing. + */ + if (current->policy == SCHED_FIFO) + return true; + + /* + * Round-robin realtime tasks time slice with other tasks at the same + * realtime priority. Is this task the only one at this priority? + */ + if (current->policy == SCHED_RR) { + struct sched_rt_entity *rt_se = ¤t->rt; + + return rt_se->run_list.prev == rt_se->run_list.next; + } + /* * More than one running task need preemption. * nr_running update is assumed to be visible -- cgit v1.3-8-gc7d7 From 0937e3b025f70e33f018aa55ee8d32b8731730a7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 9 Feb 2015 11:31:13 -0600 Subject: livepatch: simplify disable error path If registering the function with ftrace has previously succeeded, unregistering will almost never fail. Even if it does, it's not a fatal error. We can still carry on and disable the klp_func from being used by removing it from the klp_ops func stack. 
Signed-off-by: Josh Poimboeuf Reviewed-by: Miroslav Benes Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 67 +++++++++++++------------------------------------ 1 file changed, 17 insertions(+), 50 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index ff7f47d026ac..26df09d56f7c 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -322,32 +322,20 @@ static void notrace klp_ftrace_handler(unsigned long ip, klp_arch_set_pc(regs, (unsigned long)func->new_func); } -static int klp_disable_func(struct klp_func *func) +static void klp_disable_func(struct klp_func *func) { struct klp_ops *ops; - int ret; - - if (WARN_ON(func->state != KLP_ENABLED)) - return -EINVAL; - if (WARN_ON(!func->old_addr)) - return -EINVAL; + WARN_ON(func->state != KLP_ENABLED); + WARN_ON(!func->old_addr); ops = klp_find_ops(func->old_addr); if (WARN_ON(!ops)) - return -EINVAL; + return; if (list_is_singular(&ops->func_stack)) { - ret = unregister_ftrace_function(&ops->fops); - if (ret) { - pr_err("failed to unregister ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - return ret; - } - - ret = ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0); - if (ret) - pr_warn("function unregister succeeded but failed to clear the filter\n"); + WARN_ON(unregister_ftrace_function(&ops->fops)); + WARN_ON(ftrace_set_filter_ip(&ops->fops, func->old_addr, 1, 0)); list_del_rcu(&func->stack_node); list_del(&ops->node); @@ -357,8 +345,6 @@ static int klp_disable_func(struct klp_func *func) } func->state = KLP_DISABLED; - - return 0; } static int klp_enable_func(struct klp_func *func) @@ -419,23 +405,15 @@ err: return ret; } -static int klp_disable_object(struct klp_object *obj) +static void klp_disable_object(struct klp_object *obj) { struct klp_func *func; - int ret; - for (func = obj->funcs; func->old_name; func++) { - if (func->state != KLP_ENABLED) - continue; - - ret = klp_disable_func(func); - if (ret) - return ret; - } + for (func = obj->funcs; func->old_name; func++) + if (func->state == KLP_ENABLED) + klp_disable_func(func); obj->state = KLP_DISABLED; - - return 0; } static int klp_enable_object(struct klp_object *obj) @@ -451,22 +429,19 @@ static int klp_enable_object(struct klp_object *obj) for (func = obj->funcs; func->old_name; func++) { ret = klp_enable_func(func); - if (ret) - goto unregister; + if (ret) { + klp_disable_object(obj); + return ret; + } } obj->state = KLP_ENABLED; return 0; - -unregister: - WARN_ON(klp_disable_object(obj)); - return ret; } static int __klp_disable_patch(struct klp_patch *patch) { struct klp_object *obj; - int ret; /* enforce stacking: only the last enabled patch can be disabled */ if (!list_is_last(&patch->list, &klp_patches) && @@ -476,12 +451,8 @@ static int __klp_disable_patch(struct klp_patch *patch) pr_notice("disabling patch '%s'\n", patch->mod->name); for (obj = patch->objs; obj->funcs; obj++) { - if (obj->state != KLP_ENABLED) - continue; - - ret = klp_disable_object(obj); - if (ret) - return ret; + if (obj->state == KLP_ENABLED) + klp_disable_object(obj); } patch->state = KLP_DISABLED; @@ -931,7 +902,6 @@ static void klp_module_notify_going(struct klp_patch *patch, { struct module *pmod = patch->mod; struct module *mod = obj->mod; - int ret; if (patch->state == KLP_DISABLED) goto disabled; @@ -939,10 +909,7 @@ static void klp_module_notify_going(struct klp_patch *patch, pr_notice("reverting patch '%s' on unloading module '%s'\n", pmod->name, mod->name); - ret = 
klp_disable_object(obj); - if (ret) - pr_warn("failed to revert patch '%s' on module '%s' (%d)\n", - pmod->name, mod->name, ret); + klp_disable_object(obj); disabled: klp_free_object_loaded(obj); -- cgit v1.3-8-gc7d7 From c4ce0da8ec62d83c96e29db7dadd6d3985344bb3 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 18 Feb 2015 18:02:13 +0100 Subject: livepatch: RCU protect struct klp_func all the time when used in klp_ftrace_handler() func->new_func has been accessed after rcu_read_unlock() in klp_ftrace_handler() and therefore the access was not protected. Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 69bf3aa3bde8..782172f073c5 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -314,12 +314,12 @@ static void notrace klp_ftrace_handler(unsigned long ip, rcu_read_lock(); func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, stack_node); - rcu_read_unlock(); - if (WARN_ON_ONCE(!func)) - return; + goto unlock; klp_arch_set_pc(regs, (unsigned long)func->new_func); +unlock: + rcu_read_unlock(); } static int klp_disable_func(struct klp_func *func) -- cgit v1.3-8-gc7d7 From 4d3199e4ca8e6670b54dc5ee070ffd54385988e9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 22 Feb 2015 19:31:41 -0800 Subject: locking: Remove ACCESS_ONCE() usage With the new standardized functions, we can replace all ACCESS_ONCE() calls across relevant locking - this includes lockref and seqlock while at it. ACCESS_ONCE() does not work reliably on non-scalar types. For example gcc 4.6 and 4.7 might remove the volatile tag for such accesses during the SRA (scalar replacement of aggregates) step: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 Update the new calls regardless of if it is a scalar type, this is cleaner than having three alternatives. Signed-off-by: Davidlohr Bueso Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Thomas Gleixner Cc: Paul E. 
McKenney Link: http://lkml.kernel.org/r/1424662301.6539.18.camel@stgolabs.net Signed-off-by: Ingo Molnar --- include/linux/seqlock.h | 6 +++--- kernel/locking/mcs_spinlock.h | 6 +++--- kernel/locking/mutex.c | 8 ++++---- kernel/locking/osq_lock.c | 14 +++++++------- kernel/locking/rwsem-xadd.c | 10 +++++----- lib/lockref.c | 2 +- 6 files changed, 23 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index f5df8f687b4d..5f68d0a391ce 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -108,7 +108,7 @@ static inline unsigned __read_seqcount_begin(const seqcount_t *s) unsigned ret; repeat: - ret = ACCESS_ONCE(s->sequence); + ret = READ_ONCE(s->sequence); if (unlikely(ret & 1)) { cpu_relax(); goto repeat; @@ -127,7 +127,7 @@ repeat: */ static inline unsigned raw_read_seqcount(const seqcount_t *s) { - unsigned ret = ACCESS_ONCE(s->sequence); + unsigned ret = READ_ONCE(s->sequence); smp_rmb(); return ret; } @@ -179,7 +179,7 @@ static inline unsigned read_seqcount_begin(const seqcount_t *s) */ static inline unsigned raw_seqcount_begin(const seqcount_t *s) { - unsigned ret = ACCESS_ONCE(s->sequence); + unsigned ret = READ_ONCE(s->sequence); smp_rmb(); return ret & ~1; } diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index d1fe2ba5bac9..75e114bdf3f2 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -78,7 +78,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) */ return; } - ACCESS_ONCE(prev->next) = node; + WRITE_ONCE(prev->next, node); /* Wait until the lock holder passes the lock down. */ arch_mcs_spin_lock_contended(&node->locked); @@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) static inline void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) { - struct mcs_spinlock *next = ACCESS_ONCE(node->next); + struct mcs_spinlock *next = READ_ONCE(node->next); if (likely(!next)) { /* @@ -100,7 +100,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) if (likely(cmpxchg(lock, node, NULL) == node)) return; /* Wait until the next pointer is set */ - while (!(next = ACCESS_ONCE(node->next))) + while (!(next = READ_ONCE(node->next))) cpu_relax_lowlatency(); } diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 43bf25ef3c81..16b2d3cc88b0 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -266,7 +266,7 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) return 0; rcu_read_lock(); - owner = ACCESS_ONCE(lock->owner); + owner = READ_ONCE(lock->owner); if (owner) retval = owner->on_cpu; rcu_read_unlock(); @@ -340,7 +340,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * As such, when deadlock detection needs to be * performed the optimistic spinning cannot be done. */ - if (ACCESS_ONCE(ww->ctx)) + if (READ_ONCE(ww->ctx)) break; } @@ -348,7 +348,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * If there's an owner, wait for it to either * release the lock or go to sleep. 
*/ - owner = ACCESS_ONCE(lock->owner); + owner = READ_ONCE(lock->owner); if (owner && !mutex_spin_on_owner(lock, owner)) break; @@ -487,7 +487,7 @@ static inline int __sched __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); - struct ww_acquire_ctx *hold_ctx = ACCESS_ONCE(ww->ctx); + struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); if (!hold_ctx) return 0; diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index c112d00341b0..dc85ee23a26f 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -98,7 +98,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) prev = decode_cpu(old); node->prev = prev; - ACCESS_ONCE(prev->next) = node; + WRITE_ONCE(prev->next, node); /* * Normally @prev is untouchable after the above store; because at that @@ -109,7 +109,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) * cmpxchg in an attempt to undo our queueing. */ - while (!ACCESS_ONCE(node->locked)) { + while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. */ @@ -148,7 +148,7 @@ unqueue: * Or we race against a concurrent unqueue()'s step-B, in which * case its step-C will write us a new @node->prev pointer. */ - prev = ACCESS_ONCE(node->prev); + prev = READ_ONCE(node->prev); } /* @@ -170,8 +170,8 @@ unqueue: * it will wait in Step-A. */ - ACCESS_ONCE(next->prev) = prev; - ACCESS_ONCE(prev->next) = next; + WRITE_ONCE(next->prev, prev); + WRITE_ONCE(prev->next, next); return false; } @@ -193,11 +193,11 @@ void osq_unlock(struct optimistic_spin_queue *lock) node = this_cpu_ptr(&osq_node); next = xchg(&node->next, NULL); if (next) { - ACCESS_ONCE(next->locked) = 1; + WRITE_ONCE(next->locked, 1); return; } next = osq_wait_next(lock, node, NULL); if (next) - ACCESS_ONCE(next->locked) = 1; + WRITE_ONCE(next->locked, 1); } diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index e4ad019e23f5..06e2214edf98 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -279,7 +279,7 @@ static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem) */ static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem) { - long old, count = ACCESS_ONCE(sem->count); + long old, count = READ_ONCE(sem->count); while (true) { if (!(count == 0 || count == RWSEM_WAITING_BIAS)) @@ -304,9 +304,9 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) return false; rcu_read_lock(); - owner = ACCESS_ONCE(sem->owner); + owner = READ_ONCE(sem->owner); if (!owner) { - long count = ACCESS_ONCE(sem->count); + long count = READ_ONCE(sem->count); /* * If sem->owner is not set, yet we have just recently entered the * slowpath with the lock being active, then there is a possibility @@ -385,7 +385,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) goto done; while (true) { - owner = ACCESS_ONCE(sem->owner); + owner = READ_ONCE(sem->owner); if (owner && !rwsem_spin_on_owner(sem, owner)) break; @@ -459,7 +459,7 @@ struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem) /* we're now waiting on the lock, but no longer actively locking */ if (waiting) { - count = ACCESS_ONCE(sem->count); + count = READ_ONCE(sem->count); /* * If there were already threads queued before us and there are diff --git a/lib/lockref.c b/lib/lockref.c index ecb9a665ec19..494994bf17c8 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -18,7 +18,7 @@ #define CMPXCHG_LOOP(CODE, SUCCESS) do { 
\ struct lockref old; \ BUILD_BUG_ON(sizeof(old) != 8); \ - old.lock_count = ACCESS_ONCE(lockref->lock_count); \ + old.lock_count = READ_ONCE(lockref->lock_count); \ while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \ struct lockref new = old, prev = old; \ CODE \ -- cgit v1.3-8-gc7d7 From 39bed6cbb842d8edf5a26b01122b391d36775b5e Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:40 +0000 Subject: perf: Make perf_cgroup_from_task() global Move perf_cgroup_from_task() from kernel/events/ to include/linux/ along with the necessary struct definitions, so that it can be used by the PMU code. When the upcoming Intel Cache Monitoring PMU driver assigns monitoring IDs to perf events, it needs to be able to check whether any two monitoring events overlap (say, a cgroup and task event), which means we need to be able to lookup the cgroup associated with a task (if any). Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Paul Mackerras Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-2-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 30 ++++++++++++++++++++++++++++++ kernel/events/core.c | 28 +--------------------------- 2 files changed, 31 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 724d3720c9b1..cae4a9481777 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -53,6 +53,7 @@ struct perf_guest_info_callbacks { #include #include #include +#include #include struct perf_callchain_entry { @@ -547,6 +548,35 @@ struct perf_output_handle { int page; }; +#ifdef CONFIG_CGROUP_PERF + +/* + * perf_cgroup_info keeps track of time_enabled for a cgroup. + * This is a per-cpu dynamically allocated data structure. + */ +struct perf_cgroup_info { + u64 time; + u64 timestamp; +}; + +struct perf_cgroup { + struct cgroup_subsys_state css; + struct perf_cgroup_info __percpu *info; +}; + +/* + * Must ensure cgroup is pinned (css_get) before calling + * this function. In other words, we cannot call this function + * if there is no cgroup event for the current CPU context. + */ +static inline struct perf_cgroup * +perf_cgroup_from_task(struct task_struct *task) +{ + return container_of(task_css(task, perf_event_cgrp_id), + struct perf_cgroup, css); +} +#endif /* CONFIG_CGROUP_PERF */ + #ifdef CONFIG_PERF_EVENTS extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); diff --git a/kernel/events/core.c b/kernel/events/core.c index 20cece0a7aea..072de3143244 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -34,11 +34,11 @@ #include #include #include +#include #include #include #include #include -#include #include #include #include @@ -351,32 +351,6 @@ static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, #ifdef CONFIG_CGROUP_PERF -/* - * perf_cgroup_info keeps track of time_enabled for a cgroup. - * This is a per-cpu dynamically allocated data structure. - */ -struct perf_cgroup_info { - u64 time; - u64 timestamp; -}; - -struct perf_cgroup { - struct cgroup_subsys_state css; - struct perf_cgroup_info __percpu *info; -}; - -/* - * Must ensure cgroup is pinned (css_get) before calling - * this function. In other words, we cannot call this function - * if there is no cgroup event for the current CPU context. 
- */ -static inline struct perf_cgroup * -perf_cgroup_from_task(struct task_struct *task) -{ - return container_of(task_css(task, perf_event_cgrp_id), - struct perf_cgroup, css); -} - static inline bool perf_cgroup_match(struct perf_event *event) { -- cgit v1.3-8-gc7d7 From eacd3ecc34472ce3751eedfc94e44c7cc6eb6305 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:41 +0000 Subject: perf: Add ->count() function to read per-package counters For PMU drivers that record per-package counters, the ->count variable cannot be used to record an accurate aggregated value, since it's not possible to perform SMP cross-calls to cpus on other packages from the context in which we update ->count. Introduce a new optional ->count() accessor function that can be used to customize how values are collected. If a PMU driver doesn't provide a ->count() function, we fallback to the existing code. There is necessarily a window of staleness with this approach because the task that generated the counter value may not have been scheduled by the cpu recently. An alternative and more complex approach would be to use a hrtimer to periodically refresh the values from a more permissive scheduling context. So, we're trading off complexity for accuracy. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-3-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 10 ++++++++++ kernel/events/core.c | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index cae4a9481777..9fc9b0d31442 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -272,6 +272,11 @@ struct pmu { */ size_t task_ctx_size; + + /* + * Return the count value for a counter. + */ + u64 (*count) (struct perf_event *event); /*optional*/ }; /** @@ -770,6 +775,11 @@ static inline void perf_event_task_sched_out(struct task_struct *prev, __perf_event_task_sched_out(prev, next); } +static inline u64 __perf_event_count(struct perf_event *event) +{ + return local64_read(&event->count) + atomic64_read(&event->child_count); +} + extern void perf_event_mmap(struct vm_area_struct *vma); extern struct perf_guest_info_callbacks *perf_guest_cbs; extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks); diff --git a/kernel/events/core.c b/kernel/events/core.c index 072de3143244..4e8dc596f101 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3194,7 +3194,10 @@ static void __perf_event_read(void *info) static inline u64 perf_event_count(struct perf_event *event) { - return local64_read(&event->count) + atomic64_read(&event->child_count); + if (event->pmu->count) + return event->pmu->count(event); + + return __perf_event_count(event); } static u64 perf_event_read(struct perf_event *event) -- cgit v1.3-8-gc7d7 From 79dff51e900fd26a073be8b23acfbd8c15edb181 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:42 +0000 Subject: perf: Move cgroup init before PMU ->event_init() The Intel QoS PMU needs to know whether an event is part of a cgroup during ->event_init(), because tasks in the same cgroup share a monitoring ID. Move the cgroup initialisation before calling into the PMU driver. 
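As a rough, hypothetical sketch (not code from this patch; example_pmu, example_alloc_cgroup_rmid() and example_alloc_task_rmid() are made-up names), a PMU driver's ->event_init() callback can now rely on the event's cgroup having been connected before it is called:

	/* Sketch only: assumes CONFIG_CGROUP_PERF so that event->cgrp exists. */
	static int example_pmu_event_init(struct perf_event *event)
	{
		if (event->attr.type != example_pmu.type)
			return -ENOENT;

		/* cgroup init now runs before ->event_init(), so this is valid */
		if (event->cgrp)
			return example_alloc_cgroup_rmid(event);

		return example_alloc_task_rmid(event);
	}

With the previous ordering, event->cgrp was still unset at this point, so a driver could not distinguish cgroup events from task events during initialisation.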
Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Link: http://lkml.kernel.org/r/1422038748-21397-4-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- kernel/events/core.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 4e8dc596f101..1fc3bae5904a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7116,7 +7116,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *group_leader, struct perf_event *parent_event, perf_overflow_handler_t overflow_handler, - void *context) + void *context, int cgroup_fd) { struct pmu *pmu; struct perf_event *event; @@ -7212,6 +7212,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (!has_branch_stack(event)) event->attr.branch_sample_type = 0; + if (cgroup_fd != -1) { + err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader); + if (err) + goto err_ns; + } + pmu = perf_init_event(event); if (!pmu) goto err_ns; @@ -7235,6 +7241,8 @@ err_pmu: event->destroy(event); module_put(pmu->module); err_ns: + if (is_cgroup_event(event)) + perf_detach_cgroup(event); if (event->ns) put_pid_ns(event->ns); kfree(event); @@ -7453,6 +7461,7 @@ SYSCALL_DEFINE5(perf_event_open, int move_group = 0; int err; int f_flags = O_RDWR; + int cgroup_fd = -1; /* for future expandability... */ if (flags & ~PERF_FLAG_ALL) @@ -7518,21 +7527,16 @@ SYSCALL_DEFINE5(perf_event_open, get_online_cpus(); + if (flags & PERF_FLAG_PID_CGROUP) + cgroup_fd = pid; + event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, - NULL, NULL); + NULL, NULL, cgroup_fd); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_cpus; } - if (flags & PERF_FLAG_PID_CGROUP) { - err = perf_cgroup_connect(pid, event, &attr, group_leader); - if (err) { - __free_event(event); - goto err_cpus; - } - } - if (is_sampling_event(event)) { if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) { err = -ENOTSUPP; @@ -7769,7 +7773,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, */ event = perf_event_alloc(attr, cpu, task, NULL, NULL, - overflow_handler, context); + overflow_handler, context, -1); if (IS_ERR(event)) { err = PTR_ERR(event); goto err; @@ -8130,7 +8134,7 @@ inherit_event(struct perf_event *parent_event, parent_event->cpu, child, group_leader, parent_event, - NULL, NULL); + NULL, NULL, -1); if (IS_ERR(child_event)) return child_event; -- cgit v1.3-8-gc7d7 From bfe1fcd2688f557a6b6a88f59ea7619228728bd7 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 23 Jan 2015 18:45:46 +0000 Subject: perf/x86/intel: Support task events with Intel CQM Add support for task events as well as system-wide events. This change has a big impact on the way that we gather LLC occupancy values in intel_cqm_event_read(). Currently, for system-wide (per-cpu) events we defer processing to userspace which knows how to discard all but one cpu result per package. Things aren't so simple for task events because we need to do the value aggregation ourselves. To do this, we defer updating the LLC occupancy value in event->count from intel_cqm_event_read() and do an SMP cross-call to read values for all packages in intel_cqm_event_count(). We need to ensure that we only do this for one task event per cache group, otherwise we'll report duplicate values. 
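As a simplified sketch of that scheme (struct rmid_read, __rmid_read() and cqm_cpumask are taken from the patch below; __read_one_package() and example_task_event_count() are illustrative names, and the RMID_VAL_ERROR/RMID_VAL_UNAVAIL filtering done by the real code is omitted):

	struct rmid_read {
		unsigned int rmid;
		atomic64_t value;
	};

	/* Runs on one CPU per package, selected via cqm_cpumask. */
	static void __read_one_package(void *info)
	{
		struct rmid_read *rr = info;

		atomic64_add(__rmid_read(rr->rmid), &rr->value);
	}

	static u64 example_task_event_count(struct perf_event *event)
	{
		struct rmid_read rr = {
			.rmid  = event->hw.cqm_rmid,
			.value = ATOMIC64_INIT(0),
		};

		/* wait=1: every package has contributed before we return */
		on_each_cpu_mask(&cqm_cpumask, __read_one_package, &rr, 1);
		return atomic64_read(&rr.value);
	}
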
If we're a system-wide event we want to fall back to the default perf_event_count() implementation. Refactor this into a common function so that we don't duplicate the code. Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an event's task (if the event isn't per-cpu) inside of the Intel CQM PMU driver. This task information is only available in the upper layers of the perf infrastructure. Other perf backends stash the target task in event->hw.*target so we need to do something similar. The task is used to determine whether events should share a cache group and an RMID. Signed-off-by: Matt Fleming Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: H. Peter Anvin Cc: Jiri Olsa Cc: Kanaka Juvva Cc: Linus Torvalds Cc: Vikas Shivappa Cc: linux-api@vger.kernel.org Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 195 +++++++++++++++++++++++++---- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 1 + kernel/events/core.c | 2 + 4 files changed, 178 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index b5d9d746dbc0..8003d87afd89 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -182,23 +182,124 @@ fail: /* * Determine if @a and @b measure the same set of tasks. + * + * If @a and @b measure the same set of tasks then we want to share a + * single RMID. */ static bool __match_event(struct perf_event *a, struct perf_event *b) { + /* Per-cpu and task events don't mix */ if ((a->attach_state & PERF_ATTACH_TASK) != (b->attach_state & PERF_ATTACH_TASK)) return false; - /* not task */ +#ifdef CONFIG_CGROUP_PERF + if (a->cgrp != b->cgrp) + return false; +#endif + + /* If not task event, we're machine wide */ + if (!(b->attach_state & PERF_ATTACH_TASK)) + return true; + + /* + * Events that target same task are placed into the same cache group. + */ + if (a->hw.cqm_target == b->hw.cqm_target) + return true; + + /* + * Are we an inherited event? + */ + if (b->parent == a) + return true; + + return false; +} + +#ifdef CONFIG_CGROUP_PERF +static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) +{ + if (event->attach_state & PERF_ATTACH_TASK) + return perf_cgroup_from_task(event->hw.cqm_target); - return true; /* if not task, we're machine wide */ + return event->cgrp; } +#endif /* * Determine if @a's tasks intersect with @b's tasks + * + * There are combinations of events that we explicitly prohibit, + * + * PROHIBITS + * system-wide -> cgroup and task + * cgroup -> system-wide + * -> task in cgroup + * task -> system-wide + * -> task in cgroup + * + * Call this function before allocating an RMID. */ static bool __conflict_event(struct perf_event *a, struct perf_event *b) { +#ifdef CONFIG_CGROUP_PERF + /* + * We can have any number of cgroups but only one system-wide + * event at a time. + */ + if (a->cgrp && b->cgrp) { + struct perf_cgroup *ac = a->cgrp; + struct perf_cgroup *bc = b->cgrp; + + /* + * This condition should have been caught in + * __match_event() and we should be sharing an RMID. 
+ */ + WARN_ON_ONCE(ac == bc); + + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } + + if (a->cgrp || b->cgrp) { + struct perf_cgroup *ac, *bc; + + /* + * cgroup and system-wide events are mutually exclusive + */ + if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) || + (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK))) + return true; + + /* + * Ensure neither event is part of the other's cgroup + */ + ac = event_to_cgroup(a); + bc = event_to_cgroup(b); + if (ac == bc) + return true; + + /* + * Must have cgroup and non-intersecting task events. + */ + if (!ac || !bc) + return false; + + /* + * We have cgroup and task events, and the task belongs + * to a cgroup. Check for for overlap. + */ + if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) || + cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup)) + return true; + + return false; + } +#endif /* * If one of them is not a task, same story as above with cgroups. */ @@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event, static void intel_cqm_event_read(struct perf_event *event) { - unsigned long rmid = event->hw.cqm_rmid; + unsigned long rmid; u64 val; + /* + * Task events are handled by intel_cqm_event_count(). + */ + if (event->cpu == -1) + return; + + rmid = event->hw.cqm_rmid; val = __rmid_read(rmid); /* @@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event) local64_set(&event->count, val); } +struct rmid_read { + unsigned int rmid; + atomic64_t value; +}; + +static void __intel_cqm_event_count(void *info) +{ + struct rmid_read *rr = info; + u64 val; + + val = __rmid_read(rr->rmid); + + if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) + return; + + atomic64_add(val, &rr->value); +} + +static inline bool cqm_group_leader(struct perf_event *event) +{ + return !list_empty(&event->hw.cqm_groups_entry); +} + +static u64 intel_cqm_event_count(struct perf_event *event) +{ + struct rmid_read rr = { + .rmid = event->hw.cqm_rmid, + .value = ATOMIC64_INIT(0), + }; + + /* + * We only need to worry about task events. System-wide events + * are handled like usual, i.e. entirely with + * intel_cqm_event_read(). + */ + if (event->cpu != -1) + return __perf_event_count(event); + + /* + * Only the group leader gets to report values. This stops us + * reporting duplicate values to userspace, and gives us a clear + * rule for which task gets to report the values. + * + * Note that it is impossible to attribute these values to + * specific packages - we forfeit that ability when we create + * task events. + */ + if (!cqm_group_leader(event)) + return 0; + + on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1); + + local64_set(&event->count, atomic64_read(&rr.value)); + + return __perf_event_count(event); +} + static void intel_cqm_event_start(struct perf_event *event, int mode) { struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); @@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event) /* * And we're the group leader.. */ - if (!list_empty(&event->hw.cqm_groups_entry)) { + if (cqm_group_leader(event)) { /* * If there was a group_other, make that leader, otherwise * destroy the group and return the RMID. 
@@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event) static struct pmu intel_cqm_pmu; -/* - * XXX there's a bit of a problem in that we cannot simply do the one - * event per node as one would want, since that one event would one get - * scheduled on the one cpu. But we want to 'schedule' the RMID on all - * CPUs. - * - * This means we want events for each CPU, however, that generates a lot - * of duplicate values out to userspace -- this is not to be helped - * unless we want to change the core code in some way. Fore more info, - * see intel_cqm_event_read(). - */ static int intel_cqm_event_init(struct perf_event *event) { struct perf_event *group = NULL; @@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event) if (event->attr.config & ~QOS_EVENT_MASK) return -EINVAL; - if (event->cpu == -1) - return -EINVAL; - /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || @@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event) mutex_lock(&cache_mutex); - err = intel_cqm_setup_event(event, &group); /* will also set rmid */ + /* Will also set rmid */ + err = intel_cqm_setup_event(event, &group); if (err) goto out; @@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = { .start = intel_cqm_event_start, .stop = intel_cqm_event_stop, .read = intel_cqm_event_read, + .count = intel_cqm_event_count, }; static inline void cqm_pick_event_reader(int cpu) @@ -599,8 +752,8 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); - + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", + PERF_TYPE_INTEL_CQM); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ca5504c48f4f..dac4c2831d82 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,6 +129,7 @@ struct hw_perf_event { struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; + struct task_struct *cqm_target; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1e3cd07cf76e..3c8b45de57ec 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -32,6 +32,7 @@ enum perf_type_id { PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, PERF_TYPE_BREAKPOINT = 5, + PERF_TYPE_INTEL_CQM = 6, PERF_TYPE_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 1fc3bae5904a..71109a045450 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7181,6 +7181,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, else if (attr->type == PERF_TYPE_BREAKPOINT) event->hw.bp_target = task; #endif + else if (attr->type == PERF_TYPE_INTEL_CQM) + event->hw.cqm_target = task; } if (!overflow_handler && parent_event) { -- cgit v1.3-8-gc7d7 From 1d4a9c17d4d204a159139361e8d4db7f9f267879 Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Sun, 22 Feb 2015 21:16:49 -0800 Subject: PM / sleep: add configurable delay for pm_test When CONFIG_PM_DEBUG=y, we provide a sysfs file (/sys/power/pm_test) for selecting one of a few suspend test modes, where rather than entering a full suspend state, the kernel will perform some subset of suspend steps, wait 5 seconds, and then resume back to normal operation. 
This mode is useful for (among other things) observing the state of the system just before entering a sleep mode, for debugging or analysis purposes. However, a constant 5 second wait is not sufficient for some sorts of analysis; for example, on an SoC, one might want to use external tools to probe the power states of various on-chip controllers or clocks. This patch turns this 5 second delay into a configurable module parameter, so users can determine how long to wait in this pseudo-suspend state before resuming the system. Example (wait 30 seconds); # echo 30 > /sys/module/suspend/parameters/pm_test_delay # echo core > /sys/power/pm_test # time echo mem > /sys/power/state ... [ 17.583625] suspend debug: Waiting for 30 second(s). ... real 0m30.381s user 0m0.017s sys 0m0.080s Signed-off-by: Brian Norris Acked-by: Pavel Machek Reviewed-by: Kevin Cernekee Acked-by: Florian Fainelli Signed-off-by: Rafael J. Wysocki --- Documentation/kernel-parameters.txt | 7 +++++++ Documentation/power/basic-pm-debugging.txt | 10 ++++++---- kernel/power/suspend.c | 13 +++++++++++-- 3 files changed, 24 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index bfcb1a62a7b4..8b1fa5e129ac 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -3462,6 +3462,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted. improve throughput, but will also increase the amount of memory reserved for use by the client. + suspend.pm_test_delay= + [SUSPEND] + Sets the number of seconds to remain in a suspend test + mode before resuming the system (see + /sys/power/pm_test). Only available when CONFIG_PM_DEBUG + is set. Default value is 5. + swapaccount=[0|1] [KNL] Enable accounting of swap in memory resource controller if no parameter or 1 is given or disable diff --git a/Documentation/power/basic-pm-debugging.txt b/Documentation/power/basic-pm-debugging.txt index edeecd447d23..b96098ccfe69 100644 --- a/Documentation/power/basic-pm-debugging.txt +++ b/Documentation/power/basic-pm-debugging.txt @@ -75,12 +75,14 @@ you should do the following: # echo platform > /sys/power/disk # echo disk > /sys/power/state -Then, the kernel will try to freeze processes, suspend devices, wait 5 seconds, -resume devices and thaw processes. If "platform" is written to +Then, the kernel will try to freeze processes, suspend devices, wait a few +seconds (5 by default, but configurable by the suspend.pm_test_delay module +parameter), resume devices and thaw processes. If "platform" is written to /sys/power/pm_test , then after suspending devices the kernel will additionally invoke the global control methods (eg. ACPI global control methods) used to -prepare the platform firmware for hibernation. Next, it will wait 5 seconds and -invoke the platform (eg. ACPI) global methods used to cancel hibernation etc. +prepare the platform firmware for hibernation. Next, it will wait a +configurable number of seconds and invoke the platform (eg. ACPI) global +methods used to cancel hibernation etc. Writing "none" to /sys/power/pm_test causes the kernel to switch to the normal hibernation/suspend operations. 
Also, when open for reading, /sys/power/pm_test diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index b7d6b3a721b1..8d7a1ef72758 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -28,6 +28,7 @@ #include #include #include +#include #include "power.h" @@ -233,12 +234,20 @@ static bool platform_suspend_again(suspend_state_t state) suspend_ops->suspend_again() : false; } +#ifdef CONFIG_PM_DEBUG +static unsigned int pm_test_delay = 5; +module_param(pm_test_delay, uint, 0644); +MODULE_PARM_DESC(pm_test_delay, + "Number of seconds to wait before resuming from suspend test"); +#endif + static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); - mdelay(5000); + printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pm_test_delay); + mdelay(pm_test_delay * 1000); return 1; } #endif /* !CONFIG_PM_DEBUG */ -- cgit v1.3-8-gc7d7 From ee376dbdf27728a2f3d30e2ba10fa387cc4c645b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 10 Jan 2015 19:47:10 -0800 Subject: rcu: Consolidate rcu_synchronize and wakeme_after_rcu() There are currently duplicate identical definitions of the rcu_synchronize() structure and the wakeme_after_rcu() function. This commit therefore consolidates them. Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 9 +++++++++ kernel/rcu/srcu.c | 17 ----------------- kernel/rcu/update.c | 15 ++++++--------- 3 files changed, 15 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 78097491cd99..3e6afed51051 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -195,6 +195,15 @@ void call_rcu_sched(struct rcu_head *head, void synchronize_sched(void); +/* + * Structure allowing asynchronous waiting on RCU. + */ +struct rcu_synchronize { + struct rcu_head head; + struct completion completion; +}; +void wakeme_after_rcu(struct rcu_head *head); + /** * call_rcu_tasks() - Queue an RCU for invocation task-based grace period * @head: structure to be used for queueing the RCU updates. diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 445bf8ffe3fb..81f53b504c18 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, } EXPORT_SYMBOL_GPL(call_srcu); -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_srcu() instance now that a - * grace period has elapsed. - */ -static void wakeme_after_rcu(struct rcu_head *head) -{ - struct rcu_synchronize *rcu; - - rcu = container_of(head, struct rcu_synchronize, head); - complete(&rcu->completion); -} - static void srcu_advance_batches(struct srcu_struct *sp, int trycount); static void srcu_reschedule(struct srcu_struct *sp); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e0d31a345ee6..8864ed90f0d7 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -199,16 +199,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -struct rcu_synchronize { - struct rcu_head head; - struct completion completion; -}; - -/* - * Awaken the corresponding synchronize_rcu() instance now that a - * grace period has elapsed. 
+/** + * wakeme_after_rcu() - Callback function to awaken a task after grace period + * @head: Pointer to rcu_head member within rcu_synchronize structure + * + * Awaken the corresponding task now that a grace period has elapsed. */ -static void wakeme_after_rcu(struct rcu_head *head) +void wakeme_after_rcu(struct rcu_head *head) { struct rcu_synchronize *rcu; -- cgit v1.3-8-gc7d7 From 3f47da0f32f5e43e6ae901129d5b9c2600011a2c Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 13 Jan 2015 15:30:34 +0800 Subject: rcu_tree: Avoid touching rnp->completed when a new GP is started In rcu_gp_init(), rnp->completed should in theory already equal rsp->completed, so we normally don't need to touch it. If something goes wrong, the code will complain, fix up rnp->completed, and avoid an oops. This commit thus avoids the normally needless store to rnp->completed. Signed-off-by: Lai Jiangshan Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..077d0b700f74 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1757,8 +1757,8 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; - WARN_ON_ONCE(rnp->completed != rsp->completed); - ACCESS_ONCE(rnp->completed) = rsp->completed; + if (WARN_ON_ONCE(rnp->completed != rsp->completed)) + ACCESS_ONCE(rnp->completed) = rsp->completed; if (rnp == rdp->mynode) (void)__note_gp_changes(rsp, rnp, rdp); rcu_preempt_boost_start_gp(rnp); -- cgit v1.3-8-gc7d7 From d3f3f3f25b1d4ee152f3f19a812c3a282da4c120 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 18:21:09 -0800 Subject: rcu: Abstract default callback-list initialization from init_callback_list() In preparation for early-boot posting of callbacks, this commit abstracts initialization of the default (non-no-CB) callbacks list from the init_callback_list() function into a new init_default_callback_list() function. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..f8cdb92da10b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1328,19 +1328,29 @@ void rcu_cpu_stall_reset(void) } /* - * Initialize the specified rcu_data structure's callback list to empty. + * Initialize the specified rcu_data structure's default callback list + * to empty. The default callback list is the one that is not used by + * no-callbacks CPUs. */ -static void init_callback_list(struct rcu_data *rdp) +static void init_default_callback_list(struct rcu_data *rdp) { int i; - if (init_nocb_callback_list(rdp)) - return; rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) rdp->nxttail[i] = &rdp->nxtlist; } +/* + * Initialize the specified rcu_data structure's callback list to empty. + */ +static void init_callback_list(struct rcu_data *rdp) +{ + if (init_nocb_callback_list(rdp)) + return; + init_default_callback_list(rdp); +} + /* * Determine the value that ->completed will have at the end of the * next subsequent grace period. This is used to tag callbacks so that -- cgit v1.3-8-gc7d7 From 2723249a31a68ccc0ec8ac59a905d7f9430bf8f6 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 20 Jan 2015 22:44:13 -0800 Subject: rcu: Wire ->rda pointers at compile time This commit wires up the rcu_state structures' ->rda pointers to the per-CPU rcu_data structures at compile time, thus ensuring that this linkage is present at early boot, in turn allowing posting of callbacks before rcu_init() is executed. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f8cdb92da10b..d2fa95e4a268 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ DEFINE_RCU_TPS(sname) \ +DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ struct rcu_state sname##_state = { \ .level = { &sname##_state.node[0] }, \ + .rda = &sname##_data, \ .call = cr, \ .fqs_state = RCU_GP_IDLE, \ .gpnum = 0UL - 300UL, \ @@ -104,8 +106,7 @@ struct rcu_state sname##_state = { \ .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ -}; \ -DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data) +} RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); @@ -3843,7 +3844,6 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } - rsp->rda = rda; init_waitqueue_head(&rsp->gp_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { -- cgit v1.3-8-gc7d7 From 143da9c2fc030a5774674f2ebc2f934fab3dcd9a Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 19:57:32 -0800 Subject: rcu: Prevent early-boot RCU callbacks from splatting Currently, a call_rcu() that precedes rcu_init() will splat due to the callback lists not having yet been initialized. This commit causes the first such callback to initialize the boot CPU's RCU callback list. Note that this commit does not change rcu_init()-time initialization, which means that the callback will be discarded at rcu_init() time. Fixing this is the job of later commits. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d2fa95e4a268..fcfdbe53bb70 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2838,11 +2838,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), if (cpu != -1) rdp = per_cpu_ptr(rsp->rda, cpu); - offline = !__call_rcu_nocb(rdp, head, lazy, flags); - WARN_ON_ONCE(offline); - /* _call_rcu() is illegal on offline CPU; leak the callback. */ - local_irq_restore(flags); - return; + if (likely(rdp->mynode)) { + /* Post-boot, so this should be for a no-CBs CPU. */ + offline = !__call_rcu_nocb(rdp, head, lazy, flags); + WARN_ON_ONCE(offline); + /* Offline CPU, _call_rcu() illegal, leak callback. */ + local_irq_restore(flags); + return; + } + /* + * Very early boot, before rcu_init(). Initialize if needed + * and then drop through to queue the callback. + */ + BUG_ON(cpu != -1); + if (!likely(rdp->nxtlist)) + init_default_callback_list(rdp); } ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1; if (lazy) -- cgit v1.3-8-gc7d7 From 59f792d1ef214592ae9b86238fa8fd00f5929b76 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 19 Jan 2015 21:43:40 -0800 Subject: rcu: Refine diagnostics for lacking kthread for no-CBs callbacks Some diagnostics under CONFIG_PROVE_RCU in rcu_nocb_cpu_needs_barrier() assume that there can be no early-boot callbacks. This commit therefore qualifies the diagnostic with rcu_scheduler_fully_active to permit early boot callbacks to avoid this splat. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..75d5f096bcb0 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1945,7 +1945,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) rhp = ACCESS_ONCE(rdp->nocb_follower_head); /* Having no rcuo kthread but CBs after scheduler starts is bad! */ - if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) { + if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp && + rcu_scheduler_fully_active) { /* RCU callback enqueued before CPU first came online??? */ pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n", cpu, rhp->func); -- cgit v1.3-8-gc7d7 From 39c8d313c3c546a414cc51b4f6571c2f8cc06407 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 20 Jan 2015 23:42:38 -0800 Subject: rcu: Avoid clobbering early boot callbacks When a CPU comes online, it initializes its callback list. This is a bad thing if this is the first time that the CPU has come online and if that CPU has early boot callbacks. This commit therefore avoid initializing the callback list if there are callbacks present, in which case the initial call_rcu() did the initialization for us. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fcfdbe53bb70..92fd3eab5823 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3583,7 +3583,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ + if (!rdp->nxtlist) + init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, -- cgit v1.3-8-gc7d7 From 1925d1967c93a1c421271aade7953f6857e9f579 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 17:45:05 -0800 Subject: rcu: Fix a couple of typos in rcu_all_qs() comment header Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 077d0b700f74..4e37c7fd9e29 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -292,10 +292,10 @@ void rcu_note_context_switch(void) EXPORT_SYMBOL_GPL(rcu_note_context_switch); /* - * Register a quiesecent state for all RCU flavors. If there is an + * Register a quiescent state for all RCU flavors. If there is an * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight * dyntick-idle quiescent state visible to other CPUs (but only for those - * RCU flavors in desparate need of a quiescent state, which will normally + * RCU flavors in desperate need of a quiescent state, which will normally * be none of them). Either way, do a lightweight quiescent state for * all RCU flavors. 
*/ -- cgit v1.3-8-gc7d7 From 0d39482c3db13aae1db143d340816108dd53e443 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Feb 2015 12:24:30 -0800 Subject: rcu: Provide rcu_expedite_gp() and rcu_unexpedite_gp() Currently, expediting of normal synchronous grace-period primitives (synchronize_rcu() and friends) is controlled by the rcu_expedited() boot/sysfs parameter. This works well, but does not handle nesting. This commit therefore provides rcu_expedite_gp() to enable expediting and rcu_unexpedite_gp() to cancel a prior rcu_expedite_gp(), both of which support nesting. Reported-by: Arjan van de Ven Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 20 ++++++++++++++++++++ kernel/rcu/update.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 68 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 78097491cd99..57a4d1f73a00 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -48,6 +48,26 @@ extern int rcu_expedited; /* for sysctl */ +#ifdef CONFIG_TINY_RCU +/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ +static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */ +{ + return false; +} + +static inline void rcu_expedite_gp(void) +{ +} + +static inline void rcu_unexpedite_gp(void) +{ +} +#else /* #ifdef CONFIG_TINY_RCU */ +bool rcu_gp_is_expedited(void); /* Internal RCU use. */ +void rcu_expedite_gp(void); +void rcu_unexpedite_gp(void); +#endif /* #else #ifdef CONFIG_TINY_RCU */ + enum rcutorture_type { RCU_FLAVOR, RCU_BH_FLAVOR, diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index e0d31a345ee6..5f850823c187 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -62,6 +62,54 @@ MODULE_ALIAS("rcupdate"); module_param(rcu_expedited, int, 0); +#ifndef CONFIG_TINY_RCU + +static atomic_t rcu_expedited_nesting; + +/* + * Should normal grace-period primitives be expedited? Intended for + * use within RCU. Note that this function takes the rcu_expedited + * sysfs/boot variable into account as well as the rcu_expedite_gp() + * nesting. So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited() + * returns false is a -really- bad idea. + */ +bool rcu_gp_is_expedited(void) +{ + return rcu_expedited || atomic_read(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_gp_is_expedited); + +/** + * rcu_expedite_gp - Expedite future RCU grace periods + * + * After a call to this function, future calls to synchronize_rcu() and + * friends act as the corresponding synchronize_rcu_expedited() function + * had instead been called. + */ +void rcu_expedite_gp(void) +{ + atomic_inc(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_expedite_gp); + +/** + * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation + * + * Undo a prior call to rcu_expedite_gp(). If all prior calls to + * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(), + * and if the rcu_expedited sysfs/boot parameter is not set, then all + * subsequent calls to synchronize_rcu() and friends will return to + * their normal non-expedited behavior. + */ +void rcu_unexpedite_gp(void) +{ + atomic_dec(&rcu_expedited_nesting); +} +EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); + +#endif /* #ifndef CONFIG_TINY_RCU */ + + #ifdef CONFIG_PREEMPT_RCU /* -- cgit v1.3-8-gc7d7 From 4bb3c5f4142a359de46cf14ebab64c4c903d6773 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Wed, 18 Feb 2015 16:31:29 -0800 Subject: rcu: Add rcu_expedite_gp() and rcu_unexpedite_gp() to rcutorture Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 30d42aa55d83..3833aa611ae7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg) static int rcu_torture_writer(void *arg) { + bool can_expedite = !rcu_gp_is_expedited(); + int expediting = 0; unsigned long gp_snap; bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal; bool gp_sync1 = gp_sync; @@ -865,6 +867,12 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Testing of dynamic grace-period expediting diabled.\n", + torture_type); /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) @@ -949,9 +957,26 @@ rcu_torture_writer(void *arg) } } rcutorture_record_progress(++rcu_torture_current_version); + /* Cycle through nesting levels of rcu_expedite_gp() calls. */ + if (can_expedite && + !(torture_random(&rand) & 0xff & (!!expediting - 1))) { + WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited()); + if (expediting >= 0) + rcu_expedite_gp(); + else + rcu_unexpedite_gp(); + if (++expediting > 3) + expediting = -expediting; + } rcu_torture_writer_state = RTWS_STUTTER; stutter_wait("rcu_torture_writer"); } while (!torture_must_stop()); + /* Reset expediting back to unexpedited. */ + if (expediting > 0) + expediting = -expediting; + while (can_expedite && expediting++ < 0) + rcu_unexpedite_gp(); + WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited()); rcu_torture_writer_state = RTWS_STOPPING; torture_kthread_stopping("rcu_torture_writer"); return 0; -- cgit v1.3-8-gc7d7 From 5afff48bdf7481570c9385a8a674a81ffb8f09ee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 18 Feb 2015 16:39:09 -0800 Subject: rcu: Update from rcu_expedited variable to rcu_gp_is_expedited() This commit updates open-coded tests of the rcu_expedited variable to instead use rcu_gp_is_expedited(). Signed-off-by: Paul E. McKenney --- kernel/rcu/srcu.c | 2 +- kernel/rcu/tree.c | 9 +++++---- kernel/rcu/tree_plugin.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 445bf8ffe3fb..c871f07eff69 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -507,7 +507,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_expedited + __synchronize_srcu(sp, rcu_gp_is_expedited() ? 
SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..4325fbe79d84 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2954,7 +2954,7 @@ void synchronize_sched(void) "Illegal synchronize_sched() in RCU-sched read-side critical section"); if (rcu_blocking_is_gp()) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_sched_expedited(); else wait_rcu_gp(call_rcu_sched); @@ -2981,7 +2981,7 @@ void synchronize_rcu_bh(void) "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); if (rcu_blocking_is_gp()) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_rcu_bh_expedited(); else wait_rcu_gp(call_rcu_bh); @@ -3660,11 +3660,12 @@ static int rcu_pm_notify(struct notifier_block *self, case PM_HIBERNATION_PREPARE: case PM_SUSPEND_PREPARE: if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ - rcu_expedited = 1; + rcu_expedite_gp(); break; case PM_POST_HIBERNATION: case PM_POST_SUSPEND: - rcu_expedited = 0; + if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ + rcu_unexpedite_gp(); break; default: break; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..63726b734d34 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -585,7 +585,7 @@ void synchronize_rcu(void) "Illegal synchronize_rcu() in RCU read-side critical section"); if (!rcu_scheduler_active) return; - if (rcu_expedited) + if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else wait_rcu_gp(call_rcu); -- cgit v1.3-8-gc7d7 From ee42571f4381f184e2672dd34ab411e5bf5bd5e0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Feb 2015 10:51:32 -0800 Subject: rcu: Add Kconfig option to expedite grace periods during boot This commit adds a CONFIG_RCU_EXPEDITE_BOOT Kconfig parameter that emulates a very early boot rcu_expedite_gp(). A late-boot call to rcu_end_inkernel_boot() will provide the corresponding rcu_unexpedite_gp(). The late-boot call to rcu_end_inkernel_boot() should be made just before init is spawned. According to Arjan: > To show the boot time, I'm using the timestamp of the "Write protecting" > line, that's pretty much the last thing we print prior to ring 3 execution. > > A kernel with default RCU behavior (inside KVM, only virtual devices) > looks like this: > > [ 0.038724] Write protecting the kernel read-only data: 10240k > > a kernel with expedited RCU (using the command line option, so that I > don't have to recompile between measurements and thus am completely > oranges-to-oranges) > > [ 0.031768] Write protecting the kernel read-only data: 10240k > > which, in percentage, is an 18% improvement. Reported-by: Arjan van de Ven Signed-off-by: Paul E. 
McKenney Tested-by: Arjan van de Ven --- include/linux/rcupdate.h | 1 + init/Kconfig | 13 +++++++++++++ kernel/rcu/update.c | 11 ++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 57a4d1f73a00..b9f039b11d31 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -278,6 +278,7 @@ static inline int rcu_preempt_depth(void) /* Internal to kernel */ void rcu_init(void); +void rcu_end_inkernel_boot(void); void rcu_sched_qs(void); void rcu_bh_qs(void); void rcu_check_callbacks(int user); diff --git a/init/Kconfig b/init/Kconfig index f5dbc6d4261b..9a0592516f48 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -791,6 +791,19 @@ config RCU_NOCB_CPU_ALL endchoice +config RCU_EXPEDITE_BOOT + bool + default n + help + This option enables expedited grace periods at boot time, + as if rcu_expedite_gp() had been invoked early in boot. + The corresponding rcu_unexpedite_gp() is invoked from + rcu_end_inkernel_boot(), which is intended to be invoked + at the end of the kernel-only boot sequence, just before + init is exec'ed. + + Accept the default if unsure. + endmenu # "RCU Subsystem" config BUILD_BIN2C diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f850823c187..7b12466f90bc 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -64,7 +64,8 @@ module_param(rcu_expedited, int, 0); #ifndef CONFIG_TINY_RCU -static atomic_t rcu_expedited_nesting; +static atomic_t rcu_expedited_nesting = + ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); /* * Should normal grace-period primitives be expedited? Intended for @@ -109,6 +110,14 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); #endif /* #ifndef CONFIG_TINY_RCU */ +/* + * Inform RCU of the end of the in-kernel boot sequence. + */ +void rcu_end_inkernel_boot(void) +{ + if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) + rcu_unexpedite_gp(); +} #ifdef CONFIG_PREEMPT_RCU -- cgit v1.3-8-gc7d7 From c136f991049f51212e3d837a9f41708158591869 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 19 Feb 2015 12:15:19 -0800 Subject: rcutorture: Make consistent use of variables The "if" statement at the beginning of rcu_torture_writer() should use the same set of variables. In theory, this does not matter because the corresponding variables (gp_sync and gp_sync1) have the same value at this point in the code, but in practice such puzzles should be removed. This commit therefore makes the use of variables consistent. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3833aa611ae7..8dbe27611ec3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -875,7 +875,7 @@ rcu_torture_writer(void *arg) torture_type); /* Initialize synctype[] array. If none set, take default. */ - if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync) + if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true; if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync) synctype[nsynctypes++] = RTWS_COND_GET; -- cgit v1.3-8-gc7d7 From 675da67f24e2d6d8df0cedf12e59085ed8bbf4e7 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Mon, 23 Feb 2015 15:57:07 -0800 Subject: rcu: Fixes to NO_HZ_FULL sysidle accounting On second and subsequent passes through quiescent-state forcing, the isidle variable was initialized to false, which would prevent full sysidle state from being reached if a grace period needed more than one round of quiescent-state forcing (which most should not). However, the check for offline CPUs in the quiescent-state forcing main loop had the wrong sense, which could prevent CPUs from ever entering full sysidle state. This commit fixes both of these bugs. Given that sysidle is not yet wired up, this has no effect in old kernels, but might have proven frustrating had anyone attempted to wire it up. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..735bd7ee749a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1798,7 +1798,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) fqs_state = RCU_FORCE_QS; } else { /* Handle dyntick-idle and offline CPUs. */ - isidle = false; + isidle = true; force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); } /* Clear flag to prevent immediate re-entry. */ @@ -2596,8 +2596,8 @@ static void force_qs_rnp(struct rcu_state *rsp, bit = 1; for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { if ((rnp->qsmask & bit) != 0) { - if ((rnp->qsmaskinit & bit) != 0) - *isidle = false; + if ((rnp->qsmaskinit & bit) == 0) + *isidle = false; /* Pending hotplug. */ if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) mask |= bit; } -- cgit v1.3-8-gc7d7 From 27153acbe1141ceecf098ca5d24c2ae2714c1a5f Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 11 Feb 2015 15:42:37 +0100 Subject: rcu: Remove unnecessary condition check in rcu_qsctr_help() When the ->curtail and ->donetail pointers differ, ->rcucblist always points to the beginning of the current list and thus cannot be NULL. Therefore, the check ->rcucblist != NULL is redundant and this commit removes it. Cc: "Paul E. McKenney" Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. McKenney --- kernel/rcu/tiny.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index cc9ceca7bde1..d4e7fe5f3baf 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching); static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { RCU_TRACE(reset_cpu_stall_ticks(rcp)); - if (rcp->rcucblist != NULL && - rcp->donetail != rcp->curtail) { + if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; } -- cgit v1.3-8-gc7d7 From 915e8a4fe45eab871a862f6467ec7e59864735b2 Mon Sep 17 00:00:00 2001 From: Alexander Gordeev Date: Wed, 11 Feb 2015 15:42:38 +0100 Subject: rcu: Remove fastpath from __rcu_process_callbacks() The standard code path accommodates a condition when no RCU callbacks are ready to invoke. Since size of the code is a priority for tiny RCU, remove the fast path. Cc: "Paul E. McKenney" Signed-off-by: Alexander Gordeev Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tiny.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index d4e7fe5f3baf..069742d61c68 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -168,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) unsigned long flags; RCU_TRACE(int cb_count = 0); - /* If no RCU callbacks ready to invoke, just return. */ - if (&rcp->rcucblist == rcp->donetail) { - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); - RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, - !!ACCESS_ONCE(rcp->rcucblist), - need_resched(), - is_idle_task(current), - false)); - return; - } - /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); -- cgit v1.3-8-gc7d7 From 5871968d531f39c23a8e6c69525bb705bca52e04 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 24 Feb 2015 11:05:36 -0800 Subject: rcu: Tighten up affinity and check for sysidle If the RCU grace-period kthread invoking rcu_sysidle_check_cpu() happens to be running on the tick_do_timer_cpu initially, then rcu_bind_gp_kthread() won't bind it. This kthread might then migrate before invoking rcu_gp_fqs(), which will trigger the WARN_ON_ONCE() in rcu_sysidle_check_cpu(). This commit therefore makes rcu_bind_gp_kthread() do the binding even if the kthread is currently on the same CPU. Because this incurs added overhead, this commit also causes each RCU grace-period kthread to invoke rcu_bind_gp_kthread() once at boot rather than at the beginning of each grace period. And as long as rcu_bind_gp_kthread() is being modified, this commit eliminates its #ifdef. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- kernel/rcu/tree_plugin.h | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 735bd7ee749a..a6972c20eaa5 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1707,7 +1707,6 @@ static int rcu_gp_init(struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); ACCESS_ONCE(rsp->gp_activity) = jiffies; - rcu_bind_gp_kthread(); raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); if (!ACCESS_ONCE(rsp->gp_flags)) { @@ -1895,6 +1894,7 @@ static int __noreturn rcu_gp_kthread(void *arg) struct rcu_state *rsp = arg; struct rcu_node *rnp = rcu_get_root(rsp); + rcu_bind_gp_kthread(); for (;;) { /* Handle grace-period start. */ diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..b46c92824db1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2763,7 +2763,8 @@ static void rcu_sysidle_exit(int irq) /* * Check to see if the current CPU is idle. Note that usermode execution - * does not count as idle. The caller must have disabled interrupts. + * does not count as idle. The caller must have disabled interrupts, + * and must be running on tick_do_timer_cpu. */ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) @@ -2784,8 +2785,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, if (!*isidle || rdp->rsp != rcu_state_p || cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) return; - if (rcu_gp_in_progress(rdp->rsp)) - WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); + /* Verify affinity of current kthread. */ + WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); /* Pick up current idle and NMI-nesting counter and check. 
*/ cur = atomic_read(&rdtp->dynticks_idle); @@ -3068,11 +3069,10 @@ static void rcu_bind_gp_kthread(void) return; #ifdef CONFIG_NO_HZ_FULL_SYSIDLE cpu = tick_do_timer_cpu; - if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu) + if (cpu >= 0 && cpu < nr_cpu_ids) set_cpus_allowed_ptr(current, cpumask_of(cpu)); #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ - if (!is_housekeeping_cpu(raw_smp_processor_id())) - housekeeping_affine(current); + housekeeping_affine(current); #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ } -- cgit v1.3-8-gc7d7 From 39afb5ee4640b4ed2cdd9e12b2a67cf785cfced8 Mon Sep 17 00:00:00 2001 From: Jon DeVree Date: Fri, 27 Feb 2015 15:52:07 -0800 Subject: kernel/sys.c: fix UNAME26 for 4.0 There's a uname workaround for broken userspace which can't handle kernel versions of 3.x. Update it for 4.x. Signed-off-by: Jon DeVree Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 667b2e62fad2..a03d9cd23ed7 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1108,6 +1108,7 @@ DECLARE_RWSEM(uts_sem); /* * Work around broken programs that cannot handle "Linux 3.0". * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40 + * And we map 4.x to 2.6.60+x, so 4.0 would be 2.6.60. */ static int override_release(char __user *release, size_t len) { @@ -1127,7 +1128,7 @@ static int override_release(char __user *release, size_t len) break; rest++; } - v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40; + v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 60; copy = clamp_t(size_t, len, 1, sizeof(buf)); copy = scnprintf(buf, copy, "2.6.%u%s", v, rest); ret = copy_to_user(release, buf, copy + 1); -- cgit v1.3-8-gc7d7 From 01e04f466e12e883907937eb04a9010533363f55 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Feb 2015 00:39:21 +0100 Subject: idle / sleep: Avoid excessive disabling and enabling interrupts Disabling interrupts at the end of cpuidle_enter_freeze() is not useful, because its caller, cpuidle_idle_call(), re-enables them right away after invoking it. To avoid that unnecessary back and forth dance with interrupts, make cpuidle_enter_freeze() enable interrupts after calling enter_freeze_proper() and drop the local_irq_disable() at its end, so that all of the code paths in it end up with interrupts enabled. Then, cpuidle_idle_call() will not need to re-enable interrupts after calling cpuidle_enter_freeze() any more, because the latter will return with interrupts enabled, in analogy with cpuidle_enter(). Reported-by: Lorenzo Pieralisi Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/cpuidle.c | 6 +++--- kernel/sched/idle.c | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 4d534582514e..b573f584b15a 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -117,6 +117,8 @@ static void enter_freeze_proper(struct cpuidle_driver *drv, * If there are states with the ->enter_freeze callback, find the deepest of * them and enter it with frozen tick. Otherwise, find the deepest state * available and enter it normally. + * + * Returns with enabled interrupts. 
*/ void cpuidle_enter_freeze(void) { @@ -132,6 +134,7 @@ void cpuidle_enter_freeze(void) index = cpuidle_find_deepest_state(drv, dev, true); if (index >= 0) { enter_freeze_proper(drv, dev, index); + local_irq_enable(); return; } @@ -144,9 +147,6 @@ void cpuidle_enter_freeze(void) cpuidle_enter(drv, dev, index); else arch_cpu_idle(); - - /* Interrupts are enabled again here. */ - local_irq_disable(); } /** diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 94b2d7b88a27..f59198bda1bf 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -116,7 +116,6 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { cpuidle_enter_freeze(); - local_irq_enable(); goto exit_idle; } -- cgit v1.3-8-gc7d7 From 9d3e2d02f54160725d97f4ab1e1e8de493fbf33a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 27 Feb 2015 17:57:09 +0100 Subject: locking/rtmutex: Set state back to running on error The "usual" path is: - rt_mutex_slowlock() - set_current_state() - task_blocks_on_rt_mutex() (ret 0) - __rt_mutex_slowlock() - sleep or not but do return with __set_current_state(TASK_RUNNING) - back to caller. In the early error case where task_blocks_on_rt_mutex() return -EDEADLK we never change the task's state back to RUNNING. I assume this is intended. Without this change after ww_mutex using rt_mutex the selftest passes but later I get plenty of: | bad: scheduling from the idle thread! backtraces. Signed-off-by: Sebastian Andrzej Siewior Acked-by: Mike Galbraith Cc: Linus Torvalds Cc: Maarten Lankhorst Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: afffc6c1805d ("locking/rtmutex: Optimize setting task running after being blocked") Link: http://lkml.kernel.org/r/1425056229-22326-4-git-send-email-bigeasy@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index e16e5542bf13..6357265a31ad 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1193,6 +1193,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); if (unlikely(ret)) { + __set_current_state(TASK_RUNNING); if (rt_mutex_has_waiters(lock)) remove_waiter(lock, &waiter); rt_mutex_handle_deadlock(ret, chwalk, &waiter); -- cgit v1.3-8-gc7d7 From f91fe17e243d1f279d425071a35e3d41290758a0 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:41 +0100 Subject: ebpf: remove kernel test stubs Now that we have BPF_PROG_TYPE_SOCKET_FILTER up and running, we can remove the test stubs which were added to get the verifier suite up. We can just let the test cases probe under socket filter type instead. In the fill/spill test case, we cannot (yet) access fields from the context (skb), but we may adapt that test case in future. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- kernel/bpf/Makefile | 3 -- kernel/bpf/test_stub.c | 78 --------------------------------------------- samples/bpf/test_verifier.c | 5 +-- 3 files changed, 3 insertions(+), 83 deletions(-) delete mode 100644 kernel/bpf/test_stub.c (limited to 'kernel') diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index a5ae60f0b0a2..e6983be12bd3 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,5 +1,2 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o hashtab.o arraymap.o helpers.o -ifdef CONFIG_TEST_BPF -obj-$(CONFIG_BPF_SYSCALL) += test_stub.o -endif diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c deleted file mode 100644 index 0ceae1e6e8b5..000000000000 --- a/kernel/bpf/test_stub.c +++ /dev/null @@ -1,78 +0,0 @@ -/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of version 2 of the GNU General Public - * License as published by the Free Software Foundation. - */ -#include -#include -#include -#include -#include - -/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC - * to be used by user space verifier testsuite - */ -struct bpf_context { - u64 arg1; - u64 arg2; -}; - -static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id) -{ - switch (func_id) { - case BPF_FUNC_map_lookup_elem: - return &bpf_map_lookup_elem_proto; - case BPF_FUNC_map_update_elem: - return &bpf_map_update_elem_proto; - case BPF_FUNC_map_delete_elem: - return &bpf_map_delete_elem_proto; - default: - return NULL; - } -} - -static const struct bpf_context_access { - int size; - enum bpf_access_type type; -} test_ctx_access[] = { - [offsetof(struct bpf_context, arg1)] = { - FIELD_SIZEOF(struct bpf_context, arg1), - BPF_READ - }, - [offsetof(struct bpf_context, arg2)] = { - FIELD_SIZEOF(struct bpf_context, arg2), - BPF_READ - }, -}; - -static bool test_is_valid_access(int off, int size, enum bpf_access_type type) -{ - const struct bpf_context_access *access; - - if (off < 0 || off >= ARRAY_SIZE(test_ctx_access)) - return false; - - access = &test_ctx_access[off]; - if (access->size == size && (access->type & type)) - return true; - - return false; -} - -static struct bpf_verifier_ops test_ops = { - .get_func_proto = test_func_proto, - .is_valid_access = test_is_valid_access, -}; - -static struct bpf_prog_type_list tl_prog = { - .ops = &test_ops, - .type = BPF_PROG_TYPE_UNSPEC, -}; - -static int __init register_test_ops(void) -{ - bpf_register_prog_type(&tl_prog); - return 0; -} -late_initcall(register_test_ops); diff --git a/samples/bpf/test_verifier.c b/samples/bpf/test_verifier.c index b96175e90363..7b56b59fad8e 100644 --- a/samples/bpf/test_verifier.c +++ b/samples/bpf/test_verifier.c @@ -288,7 +288,8 @@ static struct bpf_test tests[] = { BPF_LDX_MEM(BPF_DW, BPF_REG_2, BPF_REG_10, -8), /* should be able to access R0 = *(R2 + 8) */ - BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8), + /* BPF_LDX_MEM(BPF_DW, BPF_REG_0, BPF_REG_2, 8), */ + BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), }, .result = ACCEPT, @@ -687,7 +688,7 @@ static int test(void) } printf("#%d %s ", i, tests[i].descr); - prog_fd = bpf_prog_load(BPF_PROG_TYPE_UNSPEC, prog, + prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, prog_len * sizeof(struct bpf_insn), "GPL"); -- cgit v1.3-8-gc7d7 From a2c83fff582ae133d9f5bb187404ea9ce4da1f96 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:42 +0100 Subject: ebpf: constify various function 
pointer structs We can move bpf_map_ops and bpf_verifier_ops and other structs into ro section, bpf_map_type_list and bpf_prog_type_list into read mostly. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 14 +++++++------- kernel/bpf/arraymap.c | 6 +++--- kernel/bpf/hashtab.c | 6 +++--- kernel/bpf/helpers.c | 6 +++--- net/core/filter.c | 6 +++--- 5 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index bbfceb756452..78446860f796 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -32,13 +32,13 @@ struct bpf_map { u32 key_size; u32 value_size; u32 max_entries; - struct bpf_map_ops *ops; + const struct bpf_map_ops *ops; struct work_struct work; }; struct bpf_map_type_list { struct list_head list_node; - struct bpf_map_ops *ops; + const struct bpf_map_ops *ops; enum bpf_map_type type; }; @@ -109,7 +109,7 @@ struct bpf_verifier_ops { struct bpf_prog_type_list { struct list_head list_node; - struct bpf_verifier_ops *ops; + const struct bpf_verifier_ops *ops; enum bpf_prog_type type; }; @@ -121,7 +121,7 @@ struct bpf_prog_aux { atomic_t refcnt; bool is_gpl_compatible; enum bpf_prog_type prog_type; - struct bpf_verifier_ops *ops; + const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; u32 used_map_cnt; struct bpf_prog *prog; @@ -138,8 +138,8 @@ struct bpf_prog *bpf_prog_get(u32 ufd); int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); /* verifier prototypes for helper functions called from eBPF programs */ -extern struct bpf_func_proto bpf_map_lookup_elem_proto; -extern struct bpf_func_proto bpf_map_update_elem_proto; -extern struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_map_lookup_elem_proto; +extern const struct bpf_func_proto bpf_map_update_elem_proto; +extern const struct bpf_func_proto bpf_map_delete_elem_proto; #endif /* _LINUX_BPF_H */ diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 9eb4d8a7cd87..8a6616583f38 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -134,7 +134,7 @@ static void array_map_free(struct bpf_map *map) kvfree(array); } -static struct bpf_map_ops array_ops = { +static const struct bpf_map_ops array_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -143,14 +143,14 @@ static struct bpf_map_ops array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list tl = { +static struct bpf_map_type_list array_type __read_mostly = { .ops = &array_ops, .type = BPF_MAP_TYPE_ARRAY, }; static int __init register_array_map(void) { - bpf_register_map_type(&tl); + bpf_register_map_type(&array_type); return 0; } late_initcall(register_array_map); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index b3ba43674310..83c209d9b17a 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -345,7 +345,7 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } -static struct bpf_map_ops htab_ops = { +static const struct bpf_map_ops htab_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -354,14 +354,14 @@ static struct bpf_map_ops htab_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list tl = { +static struct bpf_map_type_list htab_type __read_mostly = { .ops = &htab_ops, .type = BPF_MAP_TYPE_HASH, }; static int __init register_htab_map(void) { - 
bpf_register_map_type(&tl); + bpf_register_map_type(&htab_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 9e3414d85459..a3c7701a8b5e 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -41,7 +41,7 @@ static u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return (unsigned long) value; } -struct bpf_func_proto bpf_map_lookup_elem_proto = { +const struct bpf_func_proto bpf_map_lookup_elem_proto = { .func = bpf_map_lookup_elem, .gpl_only = false, .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, @@ -60,7 +60,7 @@ static u64 bpf_map_update_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return map->ops->map_update_elem(map, key, value, r4); } -struct bpf_func_proto bpf_map_update_elem_proto = { +const struct bpf_func_proto bpf_map_update_elem_proto = { .func = bpf_map_update_elem, .gpl_only = false, .ret_type = RET_INTEGER, @@ -80,7 +80,7 @@ static u64 bpf_map_delete_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) return map->ops->map_delete_elem(map, key); } -struct bpf_func_proto bpf_map_delete_elem_proto = { +const struct bpf_func_proto bpf_map_delete_elem_proto = { .func = bpf_map_delete_elem, .gpl_only = false, .ret_type = RET_INTEGER, diff --git a/net/core/filter.c b/net/core/filter.c index f6bdc2b1ba01..6fe09e36dad9 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1159,19 +1159,19 @@ static bool sock_filter_is_valid_access(int off, int size, enum bpf_access_type return false; } -static struct bpf_verifier_ops sock_filter_ops = { +static const struct bpf_verifier_ops sock_filter_ops = { .get_func_proto = sock_filter_func_proto, .is_valid_access = sock_filter_is_valid_access, }; -static struct bpf_prog_type_list tl = { +static struct bpf_prog_type_list sock_filter_type __read_mostly = { .ops = &sock_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static int __init register_sock_filter_ops(void) { - bpf_register_prog_type(&tl); + bpf_register_prog_type(&sock_filter_type); return 0; } late_initcall(register_sock_filter_ops); -- cgit v1.3-8-gc7d7 From 96be4325f443dbbfeb37d2a157675ac0736531a1 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:46 +0100 Subject: ebpf: add sched_cls_type and map it to sk_filter's verifier ops As discussed recently and at netconf/netdev01, we want to prevent making bpf_verifier_ops registration available for modules, but have them at a controlled place inside the kernel instead. The reason for this is, that out-of-tree modules can go crazy and define and register any verfifier ops they want, doing all sorts of crap, even bypassing available GPLed eBPF helper functions. We don't want to offer such a shiny playground, of course, but keep strict control to ourselves inside the core kernel. This also encourages us to design eBPF user helpers carefully and generically, so they can be shared among various subsystems using eBPF. For the eBPF traffic classifier (cls_bpf), it's a good start to share the same helper facilities as we currently do in eBPF for socket filters. That way, we have BPF_PROG_TYPE_SCHED_CLS look like it's own type, thus one day if there's a good reason to diverge the set of helper functions from the set available to socket filters, we keep ABI compatibility. In future, we could place all bpf_prog_type_list at a central place, perhaps. Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 15 +++++++++++++-- net/core/filter.c | 7 +++++++ 3 files changed, 21 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0248180bf2e2..3fa1af8a58d7 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_SCHED_CLS, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a28e09c7825d..594d341f04db 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1172,6 +1172,17 @@ static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn) return 0; } +static bool may_access_skb(enum bpf_prog_type type) +{ + switch (type) { + case BPF_PROG_TYPE_SOCKET_FILTER: + case BPF_PROG_TYPE_SCHED_CLS: + return true; + default: + return false; + } +} + /* verify safety of LD_ABS|LD_IND instructions: * - they can only appear in the programs where ctx == skb * - since they are wrappers of function calls, they scratch R1-R5 registers, @@ -1194,8 +1205,8 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *reg; int i, err; - if (env->prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { - verbose("BPF_LD_ABS|IND instructions are only allowed in socket filters\n"); + if (!may_access_skb(env->prog->aux->prog_type)) { + verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); return -EINVAL; } diff --git a/net/core/filter.c b/net/core/filter.c index 741721233166..514d4082f326 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1166,9 +1166,16 @@ static struct bpf_prog_type_list sk_filter_type __read_mostly = { .type = BPF_PROG_TYPE_SOCKET_FILTER, }; +static struct bpf_prog_type_list sched_cls_type __read_mostly = { + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SCHED_CLS, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); + bpf_register_prog_type(&sched_cls_type); + return 0; } late_initcall(register_sk_filter_ops); -- cgit v1.3-8-gc7d7 From 24701ecea76b0b93bd9667486934ec310825f558 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:47 +0100 Subject: ebpf: move read-only fields to bpf_prog and shrink bpf_prog_aux is_gpl_compatible and prog_type should be moved directly into bpf_prog as they stay immutable during bpf_prog's lifetime, are core attributes and they can be locked as read-only later on via bpf_prog_select_runtime(). With a bit of rearranging, this also allows us to shrink bpf_prog_aux to exactly 1 cacheline. Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 4 +--- include/linux/filter.h | 4 +++- kernel/bpf/syscall.c | 7 +++---- kernel/bpf/verifier.c | 4 ++-- net/core/filter.c | 4 ++-- 5 files changed, 11 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 9c458144cdb4..a1a7ff2df328 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -117,11 +117,9 @@ struct bpf_prog; struct bpf_prog_aux { atomic_t refcnt; - bool is_gpl_compatible; - enum bpf_prog_type prog_type; + u32 used_map_cnt; const struct bpf_verifier_ops *ops; struct bpf_map **used_maps; - u32 used_map_cnt; struct bpf_prog *prog; struct work_struct work; }; diff --git a/include/linux/filter.h b/include/linux/filter.h index 5e3863d5f666..9ee8c67ea249 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -308,9 +308,11 @@ struct bpf_binary_header { struct bpf_prog { u16 pages; /* Number of allocated pages */ bool jited; /* Is our filter JIT'ed? */ + bool gpl_compatible; /* Is our filter GPL compatible? */ u32 len; /* Number of filter blocks */ - struct sock_fprog_kern *orig_prog; /* Original BPF program */ + enum bpf_prog_type type; /* Type of BPF program */ struct bpf_prog_aux *aux; /* Auxiliary fields */ + struct sock_fprog_kern *orig_prog; /* Original BPF program */ unsigned int (*bpf_func)(const struct sk_buff *skb, const struct bpf_insn *filter); /* Instructions for interpreter */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..0d69449acbd0 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -354,10 +354,11 @@ static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) list_for_each_entry(tl, &bpf_prog_types, list_node) { if (tl->type == type) { prog->aux->ops = tl->ops; - prog->aux->prog_type = type; + prog->type = type; return 0; } } + return -EINVAL; } @@ -508,7 +509,7 @@ static int bpf_prog_load(union bpf_attr *attr) prog->jited = false; atomic_set(&prog->aux->refcnt, 1); - prog->aux->is_gpl_compatible = is_gpl; + prog->gpl_compatible = is_gpl; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); @@ -517,7 +518,6 @@ static int bpf_prog_load(union bpf_attr *attr) /* run eBPF verifier */ err = bpf_check(prog, attr); - if (err < 0) goto free_used_maps; @@ -528,7 +528,6 @@ static int bpf_prog_load(union bpf_attr *attr) bpf_prog_select_runtime(prog); err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC); - if (err < 0) /* failed to allocate fd */ goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 594d341f04db..bdf4192a889b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -852,7 +852,7 @@ static int check_call(struct verifier_env *env, int func_id) } /* eBPF programs must be GPL compatible to use GPL-ed functions */ - if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) { + if (!env->prog->gpl_compatible && fn->gpl_only) { verbose("cannot call GPL only function from proprietary program\n"); return -EINVAL; } @@ -1205,7 +1205,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn) struct reg_state *reg; int i, err; - if (!may_access_skb(env->prog->aux->prog_type)) { + if (!may_access_skb(env->prog->type)) { verbose("BPF_LD_ABS|IND instructions not allowed for this program type\n"); return -EINVAL; } diff --git a/net/core/filter.c b/net/core/filter.c index 514d4082f326..ff000cb25e0a 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -814,7 +814,7 @@ static void 
bpf_release_orig_filter(struct bpf_prog *fp) static void __bpf_prog_release(struct bpf_prog *prog) { - if (prog->aux->prog_type == BPF_PROG_TYPE_SOCKET_FILTER) { + if (prog->type == BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); } else { bpf_release_orig_filter(prog); @@ -1105,7 +1105,7 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) if (IS_ERR(prog)) return PTR_ERR(prog); - if (prog->aux->prog_type != BPF_PROG_TYPE_SOCKET_FILTER) { + if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) { bpf_prog_put(prog); return -EINVAL; } -- cgit v1.3-8-gc7d7 From e2e9b6541dd4b31848079da80fe2253daaafb549 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sun, 1 Mar 2015 12:31:48 +0100 Subject: cls_bpf: add initial eBPF support for programmable classifiers This work extends the "classic" BPF programmable tc classifier by extending its scope also to native eBPF code! This allows for user space to implement own custom, 'safe' C like classifiers (or whatever other frontend language LLVM et al may provide in future), that can then be compiled with the LLVM eBPF backend to an eBPF elf file. The result of this can be loaded into the kernel via iproute2's tc. In the kernel, they can be JITed on major archs and thus run in native performance. Simple, minimal toy example to demonstrate the workflow: #include #include #include #include "tc_bpf_api.h" __section("classify") int cls_main(struct sk_buff *skb) { return (0x800 << 16) | load_byte(skb, ETH_HLEN + __builtin_offsetof(struct iphdr, tos)); } char __license[] __section("license") = "GPL"; The classifier can then be compiled into eBPF opcodes and loaded via tc, for example: clang -O2 -emit-llvm -c cls.c -o - | llc -march=bpf -filetype=obj -o cls.o tc filter add dev em1 parent 1: bpf cls.o [...] As it has been demonstrated, the scope can even reach up to a fully fledged flow dissector (similarly as in samples/bpf/sockex2_kern.c). For tc, maps are allowed to be used, but from kernel context only, in other words, eBPF code can keep state across filter invocations. In future, we perhaps may reattach from a different application to those maps e.g., to read out collected statistics/state. Similarly as in socket filters, we may extend functionality for eBPF classifiers over time depending on the use cases. For that purpose, cls_bpf programs are using BPF_PROG_TYPE_SCHED_CLS program type, so we can allow additional functions/accessors (e.g. an ABI compatible offset translation to skb fields/metadata). For an initial cls_bpf support, we allow the same set of helper functions as eBPF socket filters, but we could diverge at some point in time w/o problem. I was wondering whether cls_bpf and act_bpf could share C programs, I can imagine that at some point, we introduce i) further common handlers for both (or even beyond their scope), and/or if truly needed ii) some restricted function space for each of them. Both can be abstracted easily through struct bpf_verifier_ops in future. The context of cls_bpf versus act_bpf is slightly different though: a cls_bpf program will return a specific classid whereas act_bpf a drop/non-drop return code, latter may also in future mangle skbs. That said, we can surely have a "classify" and "action" section in a single object file, or considered mentioned constraint add a possibility of a shared section. 
The workflow for getting native eBPF running from tc [1] is as follows: for f_bpf, I've added a slightly modified ELF parser code from Alexei's kernel sample, which reads out the LLVM compiled object, sets up maps (and dynamically fixes up map fds) if any, and loads the eBPF instructions all centrally through the bpf syscall. The resulting fd from the loaded program itself is being passed down to cls_bpf, which looks up struct bpf_prog from the fd store, and holds reference, so that it stays available also after tc program lifetime. On tc filter destruction, it will then drop its reference. Moreover, I've also added the optional possibility to annotate an eBPF filter with a name (e.g. path to object file, or something else if preferred) so that when tc dumps currently installed filters, some more context can be given to an admin for a given instance (as opposed to just the file descriptor number). Last but not least, bpf_prog_get() and bpf_prog_put() needed to be exported, so that eBPF can be used from cls_bpf built as a module. Thanks to 60a3b2253c41 ("net: bpf: make eBPF interpreter images read-only") I think this is of no concern since anything wanting to alter eBPF opcode after verification stage would crash the kernel. [1] http://git.breakpoint.cc/cgit/dborkman/iproute2.git/log/?h=ebpf Signed-off-by: Daniel Borkmann Cc: Jamal Hadi Salim Cc: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/pkt_cls.h | 2 + kernel/bpf/syscall.c | 2 + net/sched/cls_bpf.c | 206 ++++++++++++++++++++++++++++++++----------- 3 files changed, 158 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 25731dfb3fcc..bf08e76bf505 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -397,6 +397,8 @@ enum { TCA_BPF_CLASSID, TCA_BPF_OPS_LEN, TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, __TCA_BPF_MAX, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d69449acbd0..669719ccc9ee 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -419,6 +419,7 @@ void bpf_prog_put(struct bpf_prog *prog) bpf_prog_free(prog); } } +EXPORT_SYMBOL_GPL(bpf_prog_put); static int bpf_prog_release(struct inode *inode, struct file *filp) { @@ -466,6 +467,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) fdput(f); return prog; } +EXPORT_SYMBOL_GPL(bpf_prog_get); /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD log_buf diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 5f3ee9e4b5bf..6f7ed8f8e6ee 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -16,6 +16,8 @@ #include #include #include +#include + #include #include #include @@ -24,6 +26,8 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Daniel Borkmann "); MODULE_DESCRIPTION("TC BPF based classifier"); +#define CLS_BPF_NAME_LEN 256 + struct cls_bpf_head { struct list_head plist; u32 hgen; @@ -32,18 +36,24 @@ struct cls_bpf_head { struct cls_bpf_prog { struct bpf_prog *filter; - struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct tcf_result res; struct list_head link; + struct tcf_result res; + struct tcf_exts exts; u32 handle; - u16 bpf_num_ops; + union { + u32 bpf_fd; + u16 bpf_num_ops; + }; + struct sock_filter *bpf_ops; + const char *bpf_name; struct tcf_proto *tp; struct rcu_head rcu; }; static const struct nla_policy bpf_policy[TCA_BPF_MAX + 1] = { [TCA_BPF_CLASSID] = { .type = NLA_U32 }, + [TCA_BPF_FD] = { .type = NLA_U32 }, + [TCA_BPF_NAME] = { .type = 
NLA_NUL_STRING, .len = CLS_BPF_NAME_LEN }, [TCA_BPF_OPS_LEN] = { .type = NLA_U16 }, [TCA_BPF_OPS] = { .type = NLA_BINARY, .len = sizeof(struct sock_filter) * BPF_MAXINSNS }, @@ -76,6 +86,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; } +static bool cls_bpf_is_ebpf(const struct cls_bpf_prog *prog) +{ + return !prog->bpf_ops; +} + static int cls_bpf_init(struct tcf_proto *tp) { struct cls_bpf_head *head; @@ -94,8 +109,12 @@ static void cls_bpf_delete_prog(struct tcf_proto *tp, struct cls_bpf_prog *prog) { tcf_exts_destroy(&prog->exts); - bpf_prog_destroy(prog->filter); + if (cls_bpf_is_ebpf(prog)) + bpf_prog_put(prog->filter); + else + bpf_prog_destroy(prog->filter); + kfree(prog->bpf_name); kfree(prog->bpf_ops); kfree(prog); } @@ -114,6 +133,7 @@ static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) list_del_rcu(&prog->link); tcf_unbind_filter(tp, &prog->res); call_rcu(&prog->rcu, __cls_bpf_delete_prog); + return 0; } @@ -151,69 +171,121 @@ static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) return ret; } -static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, - struct cls_bpf_prog *prog, - unsigned long base, struct nlattr **tb, - struct nlattr *est, bool ovr) +static int cls_bpf_prog_from_ops(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) { struct sock_filter *bpf_ops; - struct tcf_exts exts; - struct sock_fprog_kern tmp; + struct sock_fprog_kern fprog_tmp; struct bpf_prog *fp; u16 bpf_size, bpf_num_ops; - u32 classid; int ret; - if (!tb[TCA_BPF_OPS_LEN] || !tb[TCA_BPF_OPS] || !tb[TCA_BPF_CLASSID]) - return -EINVAL; - - tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); - ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); - if (ret < 0) - return ret; - - classid = nla_get_u32(tb[TCA_BPF_CLASSID]); bpf_num_ops = nla_get_u16(tb[TCA_BPF_OPS_LEN]); - if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) { - ret = -EINVAL; - goto errout; - } + if (bpf_num_ops > BPF_MAXINSNS || bpf_num_ops == 0) + return -EINVAL; bpf_size = bpf_num_ops * sizeof(*bpf_ops); - if (bpf_size != nla_len(tb[TCA_BPF_OPS])) { - ret = -EINVAL; - goto errout; - } + if (bpf_size != nla_len(tb[TCA_BPF_OPS])) + return -EINVAL; bpf_ops = kzalloc(bpf_size, GFP_KERNEL); - if (bpf_ops == NULL) { - ret = -ENOMEM; - goto errout; - } + if (bpf_ops == NULL) + return -ENOMEM; memcpy(bpf_ops, nla_data(tb[TCA_BPF_OPS]), bpf_size); - tmp.len = bpf_num_ops; - tmp.filter = bpf_ops; + fprog_tmp.len = bpf_num_ops; + fprog_tmp.filter = bpf_ops; - ret = bpf_prog_create(&fp, &tmp); - if (ret) - goto errout_free; + ret = bpf_prog_create(&fp, &fprog_tmp); + if (ret < 0) { + kfree(bpf_ops); + return ret; + } - prog->bpf_num_ops = bpf_num_ops; prog->bpf_ops = bpf_ops; + prog->bpf_num_ops = bpf_num_ops; + prog->bpf_name = NULL; + prog->filter = fp; prog->res.classid = classid; + return 0; +} + +static int cls_bpf_prog_from_efd(struct nlattr **tb, + struct cls_bpf_prog *prog, u32 classid) +{ + struct bpf_prog *fp; + char *name = NULL; + u32 bpf_fd; + + bpf_fd = nla_get_u32(tb[TCA_BPF_FD]); + + fp = bpf_prog_get(bpf_fd); + if (IS_ERR(fp)) + return PTR_ERR(fp); + + if (fp->type != BPF_PROG_TYPE_SCHED_CLS) { + bpf_prog_put(fp); + return -EINVAL; + } + + if (tb[TCA_BPF_NAME]) { + name = kmemdup(nla_data(tb[TCA_BPF_NAME]), + nla_len(tb[TCA_BPF_NAME]), + GFP_KERNEL); + if (!name) { + bpf_prog_put(fp); + return -ENOMEM; + } + } + + prog->bpf_ops = NULL; + prog->bpf_fd = bpf_fd; + prog->bpf_name = name; + + prog->filter = fp; + prog->res.classid = 
classid; + + return 0; +} + +static int cls_bpf_modify_existing(struct net *net, struct tcf_proto *tp, + struct cls_bpf_prog *prog, + unsigned long base, struct nlattr **tb, + struct nlattr *est, bool ovr) +{ + struct tcf_exts exts; + bool is_bpf, is_ebpf; + u32 classid; + int ret; + + is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; + is_ebpf = tb[TCA_BPF_FD]; + + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf) || + !tb[TCA_BPF_CLASSID]) + return -EINVAL; + + tcf_exts_init(&exts, TCA_BPF_ACT, TCA_BPF_POLICE); + ret = tcf_exts_validate(net, tp, tb, est, &exts, ovr); + if (ret < 0) + return ret; + + classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + + ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog, classid) : + cls_bpf_prog_from_efd(tb, prog, classid); + if (ret < 0) { + tcf_exts_destroy(&exts); + return ret; + } + tcf_bind_filter(tp, &prog->res, base); tcf_exts_change(tp, &prog->exts, &exts); return 0; -errout_free: - kfree(bpf_ops); -errout: - tcf_exts_destroy(&exts); - return ret; } static u32 cls_bpf_grab_new_handle(struct tcf_proto *tp, @@ -297,11 +369,43 @@ errout: return ret; } +static int cls_bpf_dump_bpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + struct nlattr *nla; + + if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) + return -EMSGSIZE; + + nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * + sizeof(struct sock_filter)); + if (nla == NULL) + return -EMSGSIZE; + + memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); + + return 0; +} + +static int cls_bpf_dump_ebpf_info(const struct cls_bpf_prog *prog, + struct sk_buff *skb) +{ + if (nla_put_u32(skb, TCA_BPF_FD, prog->bpf_fd)) + return -EMSGSIZE; + + if (prog->bpf_name && + nla_put_string(skb, TCA_BPF_NAME, prog->bpf_name)) + return -EMSGSIZE; + + return 0; +} + static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, struct sk_buff *skb, struct tcmsg *tm) { struct cls_bpf_prog *prog = (struct cls_bpf_prog *) fh; - struct nlattr *nest, *nla; + struct nlattr *nest; + int ret; if (prog == NULL) return skb->len; @@ -314,16 +418,14 @@ static int cls_bpf_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, if (nla_put_u32(skb, TCA_BPF_CLASSID, prog->res.classid)) goto nla_put_failure; - if (nla_put_u16(skb, TCA_BPF_OPS_LEN, prog->bpf_num_ops)) - goto nla_put_failure; - nla = nla_reserve(skb, TCA_BPF_OPS, prog->bpf_num_ops * - sizeof(struct sock_filter)); - if (nla == NULL) + if (cls_bpf_is_ebpf(prog)) + ret = cls_bpf_dump_ebpf_info(prog, skb); + else + ret = cls_bpf_dump_bpf_info(prog, skb); + if (ret) goto nla_put_failure; - memcpy(nla_data(nla), prog->bpf_ops, nla_len(nla)); - if (tcf_exts_dump(skb, &prog->exts) < 0) goto nla_put_failure; -- cgit v1.3-8-gc7d7 From 790317e1b266c776765a4bdcedefea706ff0fada Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 13 Feb 2015 11:19:49 +0800 Subject: cpuset: initialize effective masks when clone_children is enabled If clone_children is enabled, effective masks won't be initialized due to the bug: # mount -t cgroup -o cpuset xxx /mnt # echo 1 > cgroup.clone_children # mkdir /mnt/tmp # cat /mnt/tmp/ # cat cpuset.effective_cpus # cat cpuset.cpus 0-15 And then this cpuset won't constrain the tasks in it. Either the bug or the fix has no effect on unified hierarchy, as there's no clone_chidren flag there any more. 
Reported-by: Christian Brauner Reported-by: Serge Hallyn Cc: # 3.17+ Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Tested-by: Serge Hallyn --- kernel/cpuset.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1d1fe9361d29..89d4ed08afba 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1979,7 +1979,9 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; + cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); + cpumask_copy(cs->effective_cpus, parent->cpus_allowed); spin_unlock_irq(&callback_lock); out_unlock: mutex_unlock(&cpuset_mutex); -- cgit v1.3-8-gc7d7 From 79063bffc81f82689bd90e16da1b49408f3bf095 Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Fri, 13 Feb 2015 11:20:30 +0800 Subject: cpuset: fix a warning when clearing configured masks in old hierarchy When we clear cpuset.cpus, cpuset.effective_cpus won't be cleared: # mount -t cgroup -o cpuset xxx /mnt # mkdir /mnt/tmp # echo 0 > /mnt/tmp/cpuset.cpus # echo > /mnt/tmp/cpuset.cpus # cat cpuset.cpus # cat cpuset.effective_cpus 0-15 And a kernel warning in update_cpumasks_hier() is triggered: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 4028 at kernel/cpuset.c:894 update_cpumasks_hier+0x471/0x650() Cc: # 3.17+ Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Tested-by: Serge Hallyn --- kernel/cpuset.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 89d4ed08afba..407611ba371b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -873,7 +873,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some CPUs. */ - if (cpumask_empty(new_cpus)) + if (cgroup_on_dfl(cp->css.cgroup) && cpumask_empty(new_cpus)) cpumask_copy(new_cpus, parent->effective_cpus); /* Skip the whole subtree if the cpumask remains the same. */ @@ -1129,7 +1129,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) * If it becomes empty, inherit the effective mask of the * parent, which is guaranteed to have some MEMs. */ - if (nodes_empty(*new_mems)) + if (cgroup_on_dfl(cp->css.cgroup) && nodes_empty(*new_mems)) *new_mems = parent->effective_mems; /* Skip the whole subtree if the nodemask remains the same. */ -- cgit v1.3-8-gc7d7 From 283cb41f426b723a0255702b761b0fc5d1b53a81 Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 13 Feb 2015 11:58:07 +0800 Subject: cpuset: Fix cpuset sched_relax_domain_level The cpuset.sched_relax_domain_level can control how far we do immediate load balancing on a system. However, it was found on recent kernels that echo'ing a value into cpuset.sched_relax_domain_level did not reduce any immediate load balancing. The reason this occurred was because the update_domain_attr_tree() traversal did not update for the "top_cpuset". This resulted in nothing being changed when modifying the sched_relax_domain_level parameter. This patch is able to address that problem by having update_domain_attr_tree() allow updates for the root in the cpuset traversal. 
Fixes: fc560a26acce ("cpuset: replace cpuset->stack_list with cpuset_for_each_descendant_pre()") Cc: # 3.9+ Signed-off-by: Jason Low Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Tested-by: Serge Hallyn --- kernel/cpuset.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 407611ba371b..fc7f4748d34a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -548,9 +548,6 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr, rcu_read_lock(); cpuset_for_each_descendant_pre(cp, pos_css, root_cs) { - if (cp == root_cs) - continue; - /* skip the whole subtree if @cp doesn't have any CPU */ if (cpumask_empty(cp->cpus_allowed)) { pos_css = css_rightmost_descendant(pos_css); -- cgit v1.3-8-gc7d7 From 295458e67284f57d154ec8156a22797c0cfb044a Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Thu, 19 Feb 2015 17:34:46 +0300 Subject: cgroup: call cgroup_subsys->bind on cgroup subsys initialization Currently, we call cgroup_subsys->bind only on unmount, remount, and when creating a new root on mount. Since the default hierarchy root is created in cgroup_init, we will not call cgroup_subsys->bind if the default hierarchy is freshly mounted. As a result, some controllers will behave incorrectly (most notably, the "memory" controller will not enable hierarchy support). Fix this by calling cgroup_subsys->bind right after initializing a cgroup subsystem. Signed-off-by: Vladimir Davydov Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 29a7b2cc593e..21a4b6d61e21 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -5040,6 +5040,9 @@ int __init cgroup_init(void) WARN_ON(cgroup_add_dfl_cftypes(ss, ss->dfl_cftypes)); WARN_ON(cgroup_add_legacy_cftypes(ss, ss->legacy_cftypes)); } + + if (ss->bind) + ss->bind(init_css_set.subsys[ssid]); } cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); -- cgit v1.3-8-gc7d7 From dfcacc154fb38fdb2c243c3dbbdc1f26a64cedc8 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 2 Mar 2015 22:25:37 +0100 Subject: cpuidle: Clean up fallback handling in cpuidle_idle_call() Move the fallback code path in cpuidle_idle_call() to the end of the function to avoid jumping to a label in an if () branch. Signed-off-by: Rafael J. Wysocki --- kernel/sched/idle.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index f59198bda1bf..84b93b68482a 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -124,20 +124,8 @@ static void cpuidle_idle_call(void) * Fall back to the default arch idle method on errors. */ next_state = cpuidle_select(drv, dev); - if (next_state < 0) { -use_default: - /* - * We can't use the cpuidle framework, let's use the default - * idle routine. - */ - if (current_clr_polling_and_test()) - local_irq_enable(); - else - arch_cpu_idle(); - - goto exit_idle; - } - + if (next_state < 0) + goto use_default; /* * The idle task must be scheduled, it is pointless to @@ -195,6 +183,19 @@ exit_idle: rcu_idle_exit(); start_critical_timings(); + return; + +use_default: + /* + * We can't use the cpuidle framework, let's use the default + * idle routine. 
+ */ + if (current_clr_polling_and_test()) + local_irq_enable(); + else + arch_cpu_idle(); + + goto exit_idle; } /* -- cgit v1.3-8-gc7d7 From 2ed11312eb19506c027e7cac039994ad42a9cb2c Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 2 Mar 2015 02:14:26 -0500 Subject: Revert "perf: Remove the extra validity check on nr_pages" This reverts commit 74390aa55678 ("perf: Remove the extra validity check on nr_pages") nr_pages equals to number of pages - 1 in perf_mmap. So nr_pages = 0 is valid. So the nr_pages != 0 && !is_power_of_2(nr_pages) are all needed for checking. Otherwise, for example, perf test 6 failed. # perf test 6 6: x86 rdpmc test :Error: mmap() syscall returned with (Invalid argument) FAILED! Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Kaixu Xia Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1425280466-7830-1-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index af924bc38121..8bb20cc39a92 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4446,7 +4446,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) * If we have rb pages ensure they're a power-of-two number, so we * can do bitmasks instead of modulo. */ - if (!is_power_of_2(nr_pages)) + if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) -- cgit v1.3-8-gc7d7 From c064a0de1bfb07c34a3798822c7e1636eea866e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 28 Feb 2015 22:24:48 +0100 Subject: livepatch: fix RCU usage in klp_find_external_symbol() While one must hold RCU-sched (aka. preempt_disable) for find_symbol() one must equally hold it over the use of the object returned. The moment you release the RCU-sched read lock, the object can be dead and gone. [jkosina@suse.cz: change subject line to be aligned with other patches] Cc: Seth Jennings Cc: Josh Poimboeuf Cc: Masami Hiramatsu Cc: Miroslav Benes Cc: Petr Mladek Cc: Jiri Kosina Cc: "Paul E. McKenney" Cc: Rusty Russell Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Masami Hiramatsu Acked-by: Paul E. McKenney Acked-by: Josh Poimboeuf Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 782172f073c5..01ca08804f51 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -248,11 +248,12 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, /* first, check if it's an exported symbol */ preempt_disable(); sym = find_symbol(name, NULL, NULL, true, true); - preempt_enable(); if (sym) { *addr = sym->value; + preempt_enable(); return 0; } + preempt_enable(); /* otherwise check if it's in another .o within the patch module */ return klp_find_object_symbol(pmod->name, name, addr); -- cgit v1.3-8-gc7d7 From 587945147cfe48c84dd92c022e25aad6d92c0da8 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Mon, 2 Mar 2015 17:51:10 -0500 Subject: cgroup: Use kvfree in pidlist_free() The wrapper already calls the appropriate free function, use it instead of spinning our own. 
Signed-off-by: Bandan Das Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cgroup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 21a4b6d61e21..a220fdb66568 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3806,10 +3806,7 @@ static void *pidlist_allocate(int count) static void pidlist_free(void *p) { - if (is_vmalloc_addr(p)) - vfree(p); - else - kfree(p); + kvfree(p); } /* -- cgit v1.3-8-gc7d7 From 34404ca8fb252ccee662c4368c555ccf774acc3b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 20:39:20 -0800 Subject: rcu: Move early-boot callbacks to no-CBs lists for no-CBs CPUs When a CPU is first determined to be a no-CBs CPUs, this commit causes any early boot callbacks to be moved to the no-CBs callback list, allowing them to be invoked. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 1 + kernel/rcu/tree_plugin.h | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 92fd3eab5823..0317bf7d997f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2851,6 +2851,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), * and then drop through to queue the callback. */ BUG_ON(cpu != -1); + WARN_ON_ONCE(!rcu_is_watching()); if (!likely(rdp->nxtlist)) init_default_callback_list(rdp); } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 75d5f096bcb0..afddd5641bea 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2393,18 +2393,8 @@ void __init rcu_init_nohz(void) pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); for_each_rcu_flavor(rsp) { - for_each_cpu(cpu, rcu_nocb_mask) { - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - - /* - * If there are early callbacks, they will need - * to be moved to the nocb lists. - */ - WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] != - &rdp->nxtlist && - rdp->nxttail[RCU_NEXT_TAIL] != NULL); - init_nocb_callback_list(rdp); - } + for_each_cpu(cpu, rcu_nocb_mask) + init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu)); rcu_organize_nocb_kthreads(rsp); } } @@ -2541,6 +2531,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) if (!rcu_is_nocb_cpu(rdp->cpu)) return false; + /* If there are early-boot callbacks, move them to nocb lists. */ + if (rdp->nxtlist) { + rdp->nocb_head = rdp->nxtlist; + rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; + atomic_long_set(&rdp->nocb_q_count, rdp->qlen); + atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); + rdp->nxtlist = NULL; + rdp->qlen = 0; + rdp->qlen_lazy = 0; + } rdp->nxttail[RCU_NEXT_TAIL] = NULL; return true; } -- cgit v1.3-8-gc7d7 From 476276781095c79580abe27a65988549ac7f5f89 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 21:10:21 -0800 Subject: rcu: Move early boot callback tests earlier Because callbacks can now be posted quite early in boot, move the early boot callback tests to precede RCU initialization. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 0317bf7d997f..c8e6569c5fbd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3948,6 +3948,8 @@ void __init rcu_init(void) { int cpu; + rcu_early_boot_tests(); + rcu_bootup_announce(); rcu_init_geometry(); rcu_init_one(&rcu_bh_state, &rcu_bh_data); @@ -3964,8 +3966,6 @@ void __init rcu_init(void) pm_notifier(rcu_pm_notify, 0); for_each_online_cpu(cpu) rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); - - rcu_early_boot_tests(); } #include "tree_plugin.h" -- cgit v1.3-8-gc7d7 From 6629240575992a6f0d18c46f5160b34527b0e501 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 19:16:38 -0800 Subject: rcu: Use IS_ENABLED() to CONFIG_RCU_FANOUT_EXACT #ifdef This commit uses IS_ENABLED() to remove the #ifdef from the rcu_init_levelspread() functions. No effect on executable code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4e37c7fd9e29..35e1604f7e3e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3734,30 +3734,26 @@ void rcu_scheduler_starting(void) * Compute the per-level fanout, either using the exact fanout specified * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. */ -#ifdef CONFIG_RCU_FANOUT_EXACT static void __init rcu_init_levelspread(struct rcu_state *rsp) { int i; - rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - rsp->levelspread[i] = CONFIG_RCU_FANOUT; -} -#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ -static void __init rcu_init_levelspread(struct rcu_state *rsp) -{ - int ccur; - int cprv; - int i; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = rsp->levelcnt[i]; - rsp->levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; + if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) { + rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + rsp->levelspread[i] = CONFIG_RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = rsp->levelcnt[i]; + rsp->levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } } } -#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ /* * Helper function for rcu_init() that initializes one rcu_state structure. -- cgit v1.3-8-gc7d7 From d24209bb689e2c7f7418faec9b4a948e922d24da Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jan 2015 15:26:03 -0800 Subject: rcu: Improve diagnostics for blocked critical sections in irq If an RCU read-side critical section occurs within an interrupt handler or a softirq handler, it cannot have been preempted. Therefore, there is a check in rcu_read_unlock_special() checking for this error. However, when this check triggers, it lacks diagnostic information. This commit therefore moves rcu_read_unlock()'s lockdep annotation to follow the call to __rcu_read_unlock() and changes rcu_read_unlock_special()'s WARN_ON_ONCE() to an lockdep_rcu_suspicious() in order to locate where the offending RCU read-side critical section began. In addition, the value of the ->rcu_read_unlock_special field is printed. Signed-off-by: Paul E. 
McKenney --- include/linux/lockdep.h | 7 ++++++- include/linux/rcupdate.h | 2 +- kernel/rcu/tree_plugin.h | 8 +++++++- 3 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 74ab23176e9b..066ba4157541 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -531,8 +531,13 @@ do { \ # define might_lock_read(lock) do { } while (0) #endif -#ifdef CONFIG_PROVE_RCU +#ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); +#else +static inline void +lockdep_rcu_suspicious(const char *file, const int line, const char *s) +{ +} #endif #endif /* __LINUX_LOCKDEP_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3e6afed51051..70b896e16f19 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -942,9 +942,9 @@ static inline void rcu_read_unlock(void) { rcu_lockdep_assert(rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); - rcu_lock_release(&rcu_lock_map); __release(RCU); __rcu_read_unlock(); + rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ } /** diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..8a33920b8845 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -334,7 +334,13 @@ void rcu_read_unlock_special(struct task_struct *t) } /* Hardware IRQ handlers cannot block, complain if they get here. */ - if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) { + if (in_irq() || in_serving_softirq()) { + lockdep_rcu_suspicious(__FILE__, __LINE__, + "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n"); + pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n", + t->rcu_read_unlock_special.s, + t->rcu_read_unlock_special.b.blocked, + t->rcu_read_unlock_special.b.need_qs); local_irq_restore(flags); return; } -- cgit v1.3-8-gc7d7 From ab6f5bd6741af7b157275de299b7b2b96f2df40e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jan 2015 16:58:06 -0800 Subject: rcu: Use IS_ENABLED() to simplify rcu_bootup_announce_oddness() This commit gets rid of some inline #ifdefs by replacing them with IS_ENABLED. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 48 ++++++++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 8a33920b8845..81c4d91fa18a 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -58,38 +58,30 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. 
*/ */ static void __init rcu_bootup_announce_oddness(void) { -#ifdef CONFIG_RCU_TRACE - pr_info("\tRCU debugfs-based tracing is enabled.\n"); -#endif -#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) - pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", - CONFIG_RCU_FANOUT); -#endif -#ifdef CONFIG_RCU_FANOUT_EXACT - pr_info("\tHierarchical RCU autobalancing is disabled.\n"); -#endif -#ifdef CONFIG_RCU_FAST_NO_HZ - pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); -#endif -#ifdef CONFIG_PROVE_RCU - pr_info("\tRCU lockdep checking is enabled.\n"); -#endif -#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE - pr_info("\tRCU torture testing starts during boot.\n"); -#endif -#if defined(CONFIG_RCU_CPU_STALL_INFO) - pr_info("\tAdditional per-CPU info printed with stalls.\n"); -#endif -#if NUM_RCU_LVL_4 != 0 - pr_info("\tFour-level hierarchy is enabled.\n"); -#endif + if (IS_ENABLED(CONFIG_RCU_TRACE)) + pr_info("\tRCU debugfs-based tracing is enabled.\n"); + if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || + (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)) + pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", + CONFIG_RCU_FANOUT); + if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) + pr_info("\tHierarchical RCU autobalancing is disabled.\n"); + if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ)) + pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); + if (IS_ENABLED(CONFIG_PROVE_RCU)) + pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE)) + pr_info("\tRCU torture testing starts during boot.\n"); + if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO)) + pr_info("\tAdditional per-CPU info printed with stalls.\n"); + if (NUM_RCU_LVL_4 != 0) + pr_info("\tFour-level hierarchy is enabled.\n"); if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); -#ifdef CONFIG_RCU_BOOST - pr_info("\tRCU kthread priority: %d.\n", kthread_prio); -#endif + if (IS_ENABLED(CONFIG_RCU_BOOST)) + pr_info("\tRCU kthread priority: %d.\n", kthread_prio); } #ifdef CONFIG_PREEMPT_RCU -- cgit v1.3-8-gc7d7 From a3bd2c09adcc80946262fd15e63868de1f0f4963 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Jan 2015 20:58:57 -0800 Subject: rcu: Add boot-up check for non-default CONFIG_RCU_FANOUT_LEAF values Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 81c4d91fa18a..c9225350d3ed 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -76,6 +76,9 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tAdditional per-CPU info printed with stalls.\n"); if (NUM_RCU_LVL_4 != 0) pr_info("\tFour-level hierarchy is enabled.\n"); + if (CONFIG_RCU_FANOUT_LEAF != 16) + pr_info("\tBuild-time adjustment of leaf fanout to %d.\n", + CONFIG_RCU_FANOUT_LEAF); if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); if (nr_cpu_ids != NR_CPUS) -- cgit v1.3-8-gc7d7 From e7580f33889299e484a80f42c20611ead42f199e Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Tue, 24 Feb 2015 14:23:39 -0800 Subject: rcu: Get rcu_sched_force_quiescent_state() where it belongs The very similar functions rcu_force_quiescent_state(), rcu_bh_force_quiescent_state(), and rcu_sched_force_quiescent_state() are supposed to be together, but have drifted apart. This commit restores rcu_sched_force_quiescent_state() to its rightful place. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 35e1604f7e3e..fbe9dd9ced54 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -409,6 +409,15 @@ void rcu_bh_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); +/* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + /* * Show the state of the grace-period kthreads. */ @@ -482,15 +491,6 @@ void rcutorture_record_progress(unsigned long vernum) } EXPORT_SYMBOL_GPL(rcutorture_record_progress); -/* - * Force a quiescent state for RCU-sched. - */ -void rcu_sched_force_quiescent_state(void) -{ - force_quiescent_state(&rcu_sched_state); -} -EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); - /* * Does the CPU have callbacks ready to be invoked? */ -- cgit v1.3-8-gc7d7 From 9910affa89fe0895153880b115ec243636e70af3 Mon Sep 17 00:00:00 2001 From: Yao Dongdong Date: Wed, 25 Feb 2015 17:09:46 +0800 Subject: rcu: Remove redundant check of cpu_online() Because invoke_cpu_core() checks whether the current CPU is online, there is no need for __call_rcu_core() to redundantly check it. There should not be any performance degradation because the called function is visible to the compiler. This commit therefore removes the redundant check. Signed-off-by: Yao Dongdong Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index fbe9dd9ced54..23194a77a768 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2741,7 +2741,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * If called from an extended quiescent state, invoke the RCU * core in order to force a re-evaluation of RCU's idleness. */ - if (!rcu_is_watching() && cpu_online(smp_processor_id())) + if (!rcu_is_watching()) invoke_rcu_core(); /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ -- cgit v1.3-8-gc7d7 From 17f480342026e54000731acaa69bf32787ce46cb Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 27 Feb 2015 00:07:55 +0100 Subject: genirq / PM: Add flag for shared NO_SUSPEND interrupt lines It currently is required that all users of NO_SUSPEND interrupt lines pass the IRQF_NO_SUSPEND flag when requesting the IRQ or the WARN_ON_ONCE() in irq_pm_install_action() will trigger. That is done to warn about situations in which unprepared interrupt handlers may be run unnecessarily for suspended devices and may attempt to access those devices by mistake. However, it may cause drivers that have no technical reasons for using IRQF_NO_SUSPEND to set that flag just because they happen to share the interrupt line with something like a timer. 
Moreover, the generic handling of wakeup interrupts introduced by commit 9ce7a25849e8 (genirq: Simplify wakeup mechanism) only works for IRQs without any NO_SUSPEND users, so the drivers of wakeup devices needing to use shared NO_SUSPEND interrupt lines for signaling system wakeup generally have to detect wakeup in their interrupt handlers. Thus if they happen to share an interrupt line with a NO_SUSPEND user, they also need to request that their interrupt handlers be run after suspend_device_irqs(). In both cases the reason for using IRQF_NO_SUSPEND is not because the driver in question has a genuine need to run its interrupt handler after suspend_device_irqs(), but because it happens to share the line with some other NO_SUSPEND user. Otherwise, the driver would do without IRQF_NO_SUSPEND just fine. To make it possible to specify that condition explicitly, introduce a new IRQ action handler flag for shared IRQs, IRQF_COND_SUSPEND, that, when set, will indicate to the IRQ core that the interrupt user is generally fine with suspending the IRQ, but it also can tolerate handler invocations after suspend_device_irqs() and, in particular, it is capable of detecting system wakeup and triggering it as appropriate from its interrupt handler. That will allow us to work around a problem with a shared timer interrupt line on at91 platforms. Link: http://marc.info/?l=linux-kernel&m=142252777602084&w=2 Link: http://marc.info/?t=142252775300011&r=1&w=2 Link: https://lkml.org/lkml/2014/12/15/552 Reported-by: Boris Brezillon Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland --- include/linux/interrupt.h | 5 +++++ include/linux/irqdesc.h | 1 + kernel/irq/manage.c | 7 ++++++- kernel/irq/pm.c | 7 ++++++- 4 files changed, 18 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 606771c7cac2..2e88580194f0 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -59,6 +59,10 @@ * IRQF_NO_THREAD - Interrupt cannot be threaded * IRQF_EARLY_RESUME - Resume IRQ early during syscore instead of at device * resume time. + * IRQF_COND_SUSPEND - If the IRQ is shared with a NO_SUSPEND user, execute this + * interrupt handler after suspending interrupts. For system + * wakeup devices users need to implement wakeup detection in + * their interrupt handlers. */ #define IRQF_DISABLED 0x00000020 #define IRQF_SHARED 0x00000080 @@ -72,6 +76,7 @@ #define IRQF_FORCE_RESUME 0x00008000 #define IRQF_NO_THREAD 0x00010000 #define IRQF_EARLY_RESUME 0x00020000 +#define IRQF_COND_SUSPEND 0x00040000 #define IRQF_TIMER (__IRQF_TIMER | IRQF_NO_SUSPEND | IRQF_NO_THREAD) diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index faf433af425e..dd1109fb241e 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -78,6 +78,7 @@ struct irq_desc { #ifdef CONFIG_PM_SLEEP unsigned int nr_actions; unsigned int no_suspend_depth; + unsigned int cond_suspend_depth; unsigned int force_resume_depth; #endif #ifdef CONFIG_PROC_FS diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 196a06fbc122..886d09e691d5 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1474,8 +1474,13 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, * otherwise we'll have trouble later trying to figure out * which interrupt is which (messes up the interrupt freeing * logic etc). 
+ * + * Also IRQF_COND_SUSPEND only makes sense for shared interrupts and + * it cannot be set along with IRQF_NO_SUSPEND. */ - if ((irqflags & IRQF_SHARED) && !dev_id) + if (((irqflags & IRQF_SHARED) && !dev_id) || + (!(irqflags & IRQF_SHARED) && (irqflags & IRQF_COND_SUSPEND)) || + ((irqflags & IRQF_NO_SUSPEND) && (irqflags & IRQF_COND_SUSPEND))) return -EINVAL; desc = irq_to_desc(irq); diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 3ca532592704..5204a6d1b985 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -43,9 +43,12 @@ void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth++; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth++; WARN_ON_ONCE(desc->no_suspend_depth && - desc->no_suspend_depth != desc->nr_actions); + (desc->no_suspend_depth + + desc->cond_suspend_depth) != desc->nr_actions); } /* @@ -61,6 +64,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) if (action->flags & IRQF_NO_SUSPEND) desc->no_suspend_depth--; + else if (action->flags & IRQF_COND_SUSPEND) + desc->cond_suspend_depth--; } static bool suspend_device_irq(struct irq_desc *desc, int irq) -- cgit v1.3-8-gc7d7 From 2e3ac940f2754d7dc616aba1643a668954fe892f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 3 Mar 2015 17:02:21 -0600 Subject: livepatch: remove unnecessary call to klp_find_object_module() klp_find_object_module() is called from both the klp register and enable paths. Only the call from the register path is necessary because the module notifier will let us know if the patched module gets loaded or unloaded. Signed-off-by: Josh Poimboeuf Reviewed-by: Petr Mladek Signed-off-by: Jiri Kosina --- kernel/livepatch/core.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 26df09d56f7c..d03d6134e824 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -511,8 +511,6 @@ static int __klp_enable_patch(struct klp_patch *patch) pr_notice("enabling patch '%s'\n", patch->mod->name); for (obj = patch->objs; obj->funcs; obj++) { - klp_find_object_module(obj); - if (!klp_is_object_loaded(obj)) continue; -- cgit v1.3-8-gc7d7 From 8603e1b30027f943cc9c1eef2b291d42c3347af1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 5 Mar 2015 08:04:13 -0500 Subject: workqueue: fix hang involving racing cancel[_delayed]_work_sync()'s for PREEMPT_NONE cancel[_delayed]_work_sync() are implemented using __cancel_work_timer() which grabs the PENDING bit using try_to_grab_pending() and then flushes the work item with PENDING set to prevent the on-going execution of the work item from requeueing itself. try_to_grab_pending() can always grab PENDING bit without blocking except when someone else is doing the above flushing during cancelation. In that case, try_to_grab_pending() returns -ENOENT. In this case, __cancel_work_timer() currently invokes flush_work(). The assumption is that the completion of the work item is what the other canceling task would be waiting for too and thus waiting for the same condition and retrying should allow forward progress without excessive busy looping Unfortunately, this doesn't work if preemption is disabled or the latter task has real time priority. Let's say task A just got woken up from flush_work() by the completion of the target work item. 
If, before task A starts executing, task B gets scheduled and invokes __cancel_work_timer() on the same work item, its try_to_grab_pending() will return -ENOENT as the work item is still being canceled by task A and flush_work() will also immediately return false as the work item is no longer executing. This puts task B in a busy loop possibly preventing task A from executing and clearing the canceling state on the work item leading to a hang. task A task B worker executing work __cancel_work_timer() try_to_grab_pending() set work CANCELING flush_work() block for work completion completion, wakes up A __cancel_work_timer() while (forever) { try_to_grab_pending() -ENOENT as work is being canceled flush_work() false as work is no longer executing } This patch removes the possible hang by updating __cancel_work_timer() to explicitly wait for clearing of CANCELING rather than invoking flush_work() after try_to_grab_pending() fails with -ENOENT. Link: http://lkml.kernel.org/g/20150206171156.GA8942@axis.com v3: bit_waitqueue() can't be used for work items defined in vmalloc area. Switched to custom wake function which matches the target work item and exclusive wait and wakeup. v2: v1 used wake_up() on bit_waitqueue() which leads to NULL deref if the target bit waitqueue has wait_bit_queue's on it. Use DEFINE_WAIT_BIT() and __wake_up_bit() instead. Reported by Tomeu Vizoso. Signed-off-by: Tejun Heo Reported-by: Rabin Vincent Cc: Tomeu Vizoso Cc: stable@vger.kernel.org Tested-by: Jesper Nilsson Tested-by: Rabin Vincent --- include/linux/workqueue.h | 3 ++- kernel/workqueue.c | 56 +++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 54 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 74db135f9957..f597846ff605 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -70,7 +70,8 @@ enum { /* data contains off-queue information when !WORK_STRUCT_PWQ */ WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, - WORK_OFFQ_CANCELING = (1 << WORK_OFFQ_FLAG_BASE), + __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, + WORK_OFFQ_CANCELING = (1 << __WORK_OFFQ_CANCELING), /* * When a work item is off queue, its high bits point to the last diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f28849394791..41ff75b478c6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2728,19 +2728,57 @@ bool flush_work(struct work_struct *work) } EXPORT_SYMBOL_GPL(flush_work); +struct cwt_wait { + wait_queue_t wait; + struct work_struct *work; +}; + +static int cwt_wakefn(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct cwt_wait *cwait = container_of(wait, struct cwt_wait, wait); + + if (cwait->work != key) + return 0; + return autoremove_wake_function(wait, mode, sync, key); +} + static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) { + static DECLARE_WAIT_QUEUE_HEAD(cancel_waitq); unsigned long flags; int ret; do { ret = try_to_grab_pending(work, is_dwork, &flags); /* - * If someone else is canceling, wait for the same event it - * would be waiting for before retrying. + * If someone else is already canceling, wait for it to + * finish. 
flush_work() doesn't work for PREEMPT_NONE + * because we may get scheduled between @work's completion + * and the other canceling task resuming and clearing + * CANCELING - flush_work() will return false immediately + * as @work is no longer busy, try_to_grab_pending() will + * return -ENOENT as @work is still being canceled and the + * other canceling task won't be able to clear CANCELING as + * we're hogging the CPU. + * + * Let's wait for completion using a waitqueue. As this + * may lead to the thundering herd problem, use a custom + * wake function which matches @work along with exclusive + * wait and wakeup. */ - if (unlikely(ret == -ENOENT)) - flush_work(work); + if (unlikely(ret == -ENOENT)) { + struct cwt_wait cwait; + + init_wait(&cwait.wait); + cwait.wait.func = cwt_wakefn; + cwait.work = work; + + prepare_to_wait_exclusive(&cancel_waitq, &cwait.wait, + TASK_UNINTERRUPTIBLE); + if (work_is_canceling(work)) + schedule(); + finish_wait(&cancel_waitq, &cwait.wait); + } } while (unlikely(ret < 0)); /* tell other tasks trying to grab @work to back off */ @@ -2749,6 +2787,16 @@ static bool __cancel_work_timer(struct work_struct *work, bool is_dwork) flush_work(work); clear_work_data(work); + + /* + * Paired with prepare_to_wait() above so that either + * waitqueue_active() is visible here or !work_is_canceling() is + * visible there. + */ + smp_mb(); + if (waitqueue_active(&cancel_waitq)) + __wake_up(&cancel_waitq, TASK_NORMAL, 1, work); + return ret; } -- cgit v1.3-8-gc7d7 From ef2b22ac540c018bd574d1846ab95b9bfcf38702 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 2 Mar 2015 22:26:55 +0100 Subject: cpuidle / sleep: Use broadcast timer for states that stop local timer Commit 381063133246 (PM / sleep: Re-implement suspend-to-idle handling) overlooked the fact that entering some sufficiently deep idle states by CPUs may cause their local timers to stop and in those cases it is necessary to switch over to a broadcast timer prior to entering the idle state. If the cpuidle driver in use does not provide the new ->enter_freeze callback for any of the idle states, that problem affects suspend-to-idle too, but it is not taken into account after the changes made by commit 381063133246. Fix that by changing the definition of cpuidle_enter_freeze() and re-arranging of the code in cpuidle_idle_call(), so the former does not call cpuidle_enter() any more and the fallback case is handled by cpuidle_idle_call() directly. Fixes: 381063133246 (PM / sleep: Re-implement suspend-to-idle handling) Reported-and-tested-by: Lorenzo Pieralisi Signed-off-by: Rafael J. Wysocki Acked-by: Peter Zijlstra (Intel) --- drivers/cpuidle/cpuidle.c | 62 +++++++++++++++++------------------------------ include/linux/cpuidle.h | 17 +++++++++++-- kernel/sched/idle.c | 30 ++++++++++++++++------- 3 files changed, 58 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 8b3e132b6a01..080bd2dbde4b 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -44,8 +44,8 @@ void disable_cpuidle(void) off = 1; } -static bool cpuidle_not_available(struct cpuidle_driver *drv, - struct cpuidle_device *dev) +bool cpuidle_not_available(struct cpuidle_driver *drv, + struct cpuidle_device *dev) { return off || !initialized || !drv || !dev || !dev->enabled; } @@ -72,14 +72,8 @@ int cpuidle_play_dead(void) return -ENODEV; } -/** - * cpuidle_find_deepest_state - Find deepest state meeting specific conditions. 
- * @drv: cpuidle driver for the given CPU. - * @dev: cpuidle device for the given CPU. - * @freeze: Whether or not the state should be suitable for suspend-to-idle. - */ -static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, - struct cpuidle_device *dev, bool freeze) +static int find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev, bool freeze) { unsigned int latency_req = 0; int i, ret = freeze ? -1 : CPUIDLE_DRIVER_STATE_START - 1; @@ -98,6 +92,17 @@ static int cpuidle_find_deepest_state(struct cpuidle_driver *drv, return ret; } +/** + * cpuidle_find_deepest_state - Find the deepest available idle state. + * @drv: cpuidle driver for the given CPU. + * @dev: cpuidle device for the given CPU. + */ +int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{ + return find_deepest_state(drv, dev, false); +} + static void enter_freeze_proper(struct cpuidle_driver *drv, struct cpuidle_device *dev, int index) { @@ -119,46 +124,26 @@ static void enter_freeze_proper(struct cpuidle_driver *drv, /** * cpuidle_enter_freeze - Enter an idle state suitable for suspend-to-idle. + * @drv: cpuidle driver for the given CPU. + * @dev: cpuidle device for the given CPU. * * If there are states with the ->enter_freeze callback, find the deepest of - * them and enter it with frozen tick. Otherwise, find the deepest state - * available and enter it normally. - * - * Returns with enabled interrupts. + * them and enter it with frozen tick. */ -void cpuidle_enter_freeze(void) +int cpuidle_enter_freeze(struct cpuidle_driver *drv, struct cpuidle_device *dev) { - struct cpuidle_device *dev = __this_cpu_read(cpuidle_devices); - struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int index; - if (cpuidle_not_available(drv, dev)) - goto fallback; - /* * Find the deepest state with ->enter_freeze present, which guarantees * that interrupts won't be enabled when it exits and allows the tick to * be frozen safely. */ - index = cpuidle_find_deepest_state(drv, dev, true); - if (index >= 0) { + index = find_deepest_state(drv, dev, true); + if (index >= 0) enter_freeze_proper(drv, dev, index); - local_irq_enable(); - return; - } - /* - * It is not safe to freeze the tick, find the deepest state available - * at all and try to enter it normally. 
- */ - index = cpuidle_find_deepest_state(drv, dev, false); - if (index >= 0) { - cpuidle_enter(drv, dev, index); - return; - } - - fallback: - arch_cpu_idle(); + return index; } /** @@ -217,9 +202,6 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv, */ int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) { - if (cpuidle_not_available(drv, dev)) - return -ENODEV; - return cpuidle_curr_governor->select(drv, dev); } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index f551a9299ac9..306178d7309f 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -126,6 +126,8 @@ struct cpuidle_driver { #ifdef CONFIG_CPU_IDLE extern void disable_cpuidle(void); +extern bool cpuidle_not_available(struct cpuidle_driver *drv, + struct cpuidle_device *dev); extern int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev); @@ -150,11 +152,17 @@ extern void cpuidle_resume(void); extern int cpuidle_enable_device(struct cpuidle_device *dev); extern void cpuidle_disable_device(struct cpuidle_device *dev); extern int cpuidle_play_dead(void); -extern void cpuidle_enter_freeze(void); +extern int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev); +extern int cpuidle_enter_freeze(struct cpuidle_driver *drv, + struct cpuidle_device *dev); extern struct cpuidle_driver *cpuidle_get_cpu_driver(struct cpuidle_device *dev); #else static inline void disable_cpuidle(void) { } +static inline bool cpuidle_not_available(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{return true; } static inline int cpuidle_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) {return -ENODEV; } @@ -183,7 +191,12 @@ static inline int cpuidle_enable_device(struct cpuidle_device *dev) {return -ENODEV; } static inline void cpuidle_disable_device(struct cpuidle_device *dev) { } static inline int cpuidle_play_dead(void) {return -ENODEV; } -static inline void cpuidle_enter_freeze(void) { } +static inline int cpuidle_find_deepest_state(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{return -ENODEV; } +static inline int cpuidle_enter_freeze(struct cpuidle_driver *drv, + struct cpuidle_device *dev) +{return -ENODEV; } static inline struct cpuidle_driver *cpuidle_get_cpu_driver( struct cpuidle_device *dev) {return NULL; } #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 84b93b68482a..80014a178342 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -82,6 +82,7 @@ static void cpuidle_idle_call(void) struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); int next_state, entered_state; unsigned int broadcast; + bool reflect; /* * Check if the idle task must be rescheduled. If it is the @@ -105,6 +106,9 @@ static void cpuidle_idle_call(void) */ rcu_idle_enter(); + if (cpuidle_not_available(drv, dev)) + goto use_default; + /* * Suspend-to-idle ("freeze") is a system state in which all user space * has been frozen, all I/O devices have been suspended and the only @@ -115,15 +119,22 @@ static void cpuidle_idle_call(void) * until a proper wakeup interrupt happens. */ if (idle_should_freeze()) { - cpuidle_enter_freeze(); - goto exit_idle; - } + entered_state = cpuidle_enter_freeze(drv, dev); + if (entered_state >= 0) { + local_irq_enable(); + goto exit_idle; + } - /* - * Ask the cpuidle framework to choose a convenient idle state. - * Fall back to the default arch idle method on errors. 
- */ - next_state = cpuidle_select(drv, dev); + reflect = false; + next_state = cpuidle_find_deepest_state(drv, dev); + } else { + reflect = true; + /* + * Ask the cpuidle framework to choose a convenient idle state. + */ + next_state = cpuidle_select(drv, dev); + } + /* Fall back to the default arch idle method on errors. */ if (next_state < 0) goto use_default; @@ -170,7 +181,8 @@ static void cpuidle_idle_call(void) /* * Give the governor an opportunity to reflect on the outcome */ - cpuidle_reflect(dev, entered_state); + if (reflect) + cpuidle_reflect(dev, entered_state); exit_idle: __current_set_polling(); -- cgit v1.3-8-gc7d7 From 168e47f2a6581fdbc5bb1845aeca1e50e2bc5c4b Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Wed, 25 Feb 2015 14:14:57 -0800 Subject: kernel/module.c: Update debug alignment after symtable generation When CONFIG_DEBUG_SET_MODULE_RONX is enabled, the sizes of module sections are aligned up so appropriate permissions can be applied. Adjusting for the symbol table may cause them to become unaligned. Make sure to re-align the sizes afterward. Signed-off-by: Laura Abbott Acked-by: Rusty Russell Signed-off-by: Catalin Marinas --- kernel/module.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index b34813f725e9..cc93cf68653c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2313,11 +2313,13 @@ static void layout_symtab(struct module *mod, struct load_info *info) info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); mod->core_size += strtab_size; + mod->core_size = debug_align(mod->core_size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, info->index.str) | INIT_OFFSET_MASK; + mod->init_size = debug_align(mod->init_size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } -- cgit v1.3-8-gc7d7 From 30a22c215a0007603ffc08021f2e8b64018517dd Mon Sep 17 00:00:00 2001 From: Peter Hurley Date: Sun, 1 Mar 2015 10:11:05 -0500 Subject: console: Fix console name size mismatch commit 6ae9200f2cab7 ("enlarge console.name") increased the storage for the console name to 16 bytes, but not the corresponding struct console_cmdline::name storage. Console names longer than 8 bytes cause read beyond end-of-string and failure to match console; I'm not sure if there are other unexpected consequences. Cc: # 2.6.22+ Signed-off-by: Peter Hurley Signed-off-by: Greg Kroah-Hartman --- kernel/printk/console_cmdline.h | 2 +- kernel/printk/printk.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h index cbd69d842341..2ca4a8b5fe57 100644 --- a/kernel/printk/console_cmdline.h +++ b/kernel/printk/console_cmdline.h @@ -3,7 +3,7 @@ struct console_cmdline { - char name[8]; /* Name of the driver */ + char name[16]; /* Name of the driver */ int index; /* Minor dev. 
to use */ char *options; /* Options for the driver */ #ifdef CONFIG_A11Y_BRAILLE_CONSOLE diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 01cfd69c54c6..bb0635bd74f2 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2464,6 +2464,7 @@ void register_console(struct console *newcon) for (i = 0, c = console_cmdline; i < MAX_CMDLINECONSOLES && c->name[0]; i++, c++) { + BUILD_BUG_ON(sizeof(c->name) != sizeof(newcon->name)); if (strcmp(c->name, newcon->name) != 0) continue; if (newcon->index >= 0 && -- cgit v1.3-8-gc7d7 From 3ba67dabaa58e3223325f0a813a6e830fb5f5cc5 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 5 Mar 2015 23:27:51 +0100 Subject: ebpf: bpf_map_*: fix linker error on avr32 and openrisc arch Fengguang reported, that on openrisc and avr32 architectures, we get the following linker errors on *_defconfig builds that have no bpf syscall support: net/built-in.o:(.rodata+0x1cd0): undefined reference to `bpf_map_lookup_elem_proto' net/built-in.o:(.rodata+0x1cd4): undefined reference to `bpf_map_update_elem_proto' net/built-in.o:(.rodata+0x1cd8): undefined reference to `bpf_map_delete_elem_proto' Fix it up by providing built-in weak definitions of the symbols, so they can be overridden when the syscall is enabled. I think the issue might be that gcc is not able to optimize all that away. This patch fixes the linker errors for me, tested with Fengguang's make.cross [1] script. [1] https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp-tests.git/plain/sbin/make.cross Reported-by: Fengguang Wu Fixes: d4052c4aea0c ("ebpf: remove CONFIG_BPF_SYSCALL ifdefs in socket filter code") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- kernel/bpf/core.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index a64e7a207d2b..50603aec766a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -656,6 +656,11 @@ void bpf_prog_free(struct bpf_prog *fp) } EXPORT_SYMBOL_GPL(bpf_prog_free); +/* Weak definitions of helper functions in case we don't have bpf syscall. */ +const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; +const struct bpf_func_proto bpf_map_update_elem_proto __weak; +const struct bpf_func_proto bpf_map_delete_elem_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ -- cgit v1.3-8-gc7d7 From 9198f6edfd9ced74fd90b238d5a354aeac89bdfa Mon Sep 17 00:00:00 2001 From: Jason Low Date: Fri, 6 Mar 2015 23:45:31 -0800 Subject: locking/rwsem: Fix lock optimistic spinning when owner is not running Ming reported soft lockups occurring when running xfstest due to the following tip:locking/core commit: b3fd4f03ca0b ("locking/rwsem: Avoid deceiving lock spinners") When doing optimistic spinning in rwsem, threads should stop spinning when the lock owner is not running. While a thread is spinning on owner, if the owner reschedules, owner->on_cpu returns false and we stop spinning. However, this commit essentially caused the check to get ignored because when we break out of the spin loop due to !on_cpu, we continue spinning if sem->owner != NULL. This patch fixes this by making sure we stop spinning if the owner is not running. Furthermore, just like with mutexes, refactor the code such that we don't have separate checks for owner_running(). 
This makes it more straightforward in terms of why we exit the spin on owner loop and we would also avoid needing to "guess" why we broke out of the loop to make this more readable. Reported-and-tested-by: Ming Lei Signed-off-by: Jason Low Acked-by: Davidlohr Bueso Cc: Andrew Morton Cc: Dave Jones Cc: Linus Torvalds Cc: Michel Lespinasse Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Cc: Tim Chen Link: http://lkml.kernel.org/r/1425714331.2475.388.camel@j-VirtualBox Signed-off-by: Ingo Molnar --- kernel/locking/rwsem-xadd.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 06e2214edf98..3417d0172a5d 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -324,32 +324,23 @@ done: return ret; } -static inline bool owner_running(struct rw_semaphore *sem, - struct task_struct *owner) -{ - if (sem->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * sem->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner) { long count; rcu_read_lock(); - while (owner_running(sem, owner)) { - /* abort spinning when need_resched */ - if (need_resched()) { + while (sem->owner == owner) { + /* + * Ensure we emit the owner->on_cpu, dereference _after_ + * checking sem->owner still matches owner, if that fails, + * owner might point to free()d memory, if it still matches, + * the rcu_read_lock() ensures the memory stays valid. + */ + barrier(); + + /* abort spinning when need_resched or owner is not running */ + if (!owner->on_cpu || need_resched()) { rcu_read_unlock(); return false; } -- cgit v1.3-8-gc7d7 From e2dca7adff8f3fae0ab250a6362798550b3c79ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Mar 2015 09:22:28 -0400 Subject: workqueue: make the workqueues list RCU walkable The workqueues list is protected by wq_pool_mutex and a workqueue and its subordinate data structures are freed directly on destruction. We want to add the ability dump workqueues from a sysrq callback which requires walking all workqueues without grabbing wq_pool_mutex. This patch makes freeing of workqueues RCU protected and makes the workqueues list walkable while holding RCU read lock. Note that pool_workqueues and pools are already sched-RCU protected. For consistency, workqueues are also protected with sched-RCU. While at it, reverse the workqueues list so that a workqueue which is created earlier comes before. The order of the list isn't significant functionally but this makes the planned sysrq dump list system workqueues first. 
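In condensed form the resulting pattern looks roughly like the sketch below; the real code is in the diff that follows, and inspect() is just a hypothetical stand-in for whatever a reader does with each workqueue:

	/* Writer side: unlink under wq_pool_mutex, defer the actual free
	 * until all sched-RCU readers are done.
	 */
	list_del_rcu(&wq->list);
	call_rcu_sched(&wq->rcu, rcu_free_wq);

	/* Reader side: walk the list without taking wq_pool_mutex. */
	rcu_read_lock_sched();
	list_for_each_entry_rcu(wq, &workqueues, list)
		inspect(wq);
	rcu_read_unlock_sched();
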
Signed-off-by: Tejun Heo --- kernel/workqueue.c | 47 +++++++++++++++++++++++++++++++---------------- 1 file changed, 31 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 41ff75b478c6..6b9b0dc3dea5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -230,7 +230,7 @@ struct wq_device; */ struct workqueue_struct { struct list_head pwqs; /* WR: all pwqs of this wq */ - struct list_head list; /* PL: list of all workqueues */ + struct list_head list; /* PR: list of all workqueues */ struct mutex mutex; /* protects this wq */ int work_color; /* WQ: current work color */ @@ -257,6 +257,13 @@ struct workqueue_struct { #endif char name[WQ_NAME_LEN]; /* I: workqueue name */ + /* + * Destruction of workqueue_struct is sched-RCU protected to allow + * walking the workqueues list without grabbing wq_pool_mutex. + * This is used to dump all workqueues from sysrq. + */ + struct rcu_head rcu; + /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ @@ -288,7 +295,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ -static LIST_HEAD(workqueues); /* PL: list of all workqueues */ +static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* the per-cpu worker pools */ @@ -3424,6 +3431,20 @@ static int init_worker_pool(struct worker_pool *pool) return 0; } +static void rcu_free_wq(struct rcu_head *rcu) +{ + struct workqueue_struct *wq = + container_of(rcu, struct workqueue_struct, rcu); + + if (!(wq->flags & WQ_UNBOUND)) + free_percpu(wq->cpu_pwqs); + else + free_workqueue_attrs(wq->unbound_attrs); + + kfree(wq->rescuer); + kfree(wq); +} + static void rcu_free_pool(struct rcu_head *rcu) { struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); @@ -3601,12 +3622,10 @@ static void pwq_unbound_release_workfn(struct work_struct *work) /* * If we're the last pwq going away, @wq is already dead and no one - * is gonna access it anymore. Free it. + * is gonna access it anymore. Schedule RCU free. */ - if (is_last) { - free_workqueue_attrs(wq->unbound_attrs); - kfree(wq); - } + if (is_last) + call_rcu_sched(&wq->rcu, rcu_free_wq); } /** @@ -4143,7 +4162,7 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, pwq_adjust_max_active(pwq); mutex_unlock(&wq->mutex); - list_add(&wq->list, &workqueues); + list_add_tail_rcu(&wq->list, &workqueues); mutex_unlock(&wq_pool_mutex); @@ -4199,24 +4218,20 @@ void destroy_workqueue(struct workqueue_struct *wq) * flushing is complete in case freeze races us. */ mutex_lock(&wq_pool_mutex); - list_del_init(&wq->list); + list_del_rcu(&wq->list); mutex_unlock(&wq_pool_mutex); workqueue_sysfs_unregister(wq); - if (wq->rescuer) { + if (wq->rescuer) kthread_stop(wq->rescuer->task); - kfree(wq->rescuer); - wq->rescuer = NULL; - } if (!(wq->flags & WQ_UNBOUND)) { /* * The base ref is never dropped on per-cpu pwqs. Directly - * free the pwqs and wq. + * schedule RCU free. */ - free_percpu(wq->cpu_pwqs); - kfree(wq); + call_rcu_sched(&wq->rcu, rcu_free_wq); } else { /* * We're the sole accessor of @wq at this point. 
Directly -- cgit v1.3-8-gc7d7 From 2607d7a6dba1e790aaacb14600ceffa3aa2f43e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Mar 2015 09:22:28 -0400 Subject: workqueue: keep track of the flushing task and pool manager Add wq_barrier->task and worker_pool->manager to keep track of the flushing task and pool manager respectively. These are purely informational and will be used to implement sysrq dump of workqueues. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6b9b0dc3dea5..0c329a6f0c51 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -159,6 +159,7 @@ struct worker_pool { /* see manage_workers() for details on the two manager mutexes */ struct mutex manager_arb; /* manager arbitration */ + struct worker *manager; /* L: purely informational */ struct mutex attach_mutex; /* attach/detach exclusion */ struct list_head workers; /* A: attached workers */ struct completion *detach_completion; /* all workers detached */ @@ -1918,9 +1919,11 @@ static bool manage_workers(struct worker *worker) */ if (!mutex_trylock(&pool->manager_arb)) return false; + pool->manager = worker; maybe_create_worker(pool); + pool->manager = NULL; mutex_unlock(&pool->manager_arb); return true; } @@ -2310,6 +2313,7 @@ repeat: struct wq_barrier { struct work_struct work; struct completion done; + struct task_struct *task; /* purely informational */ }; static void wq_barrier_func(struct work_struct *work) @@ -2358,6 +2362,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, INIT_WORK_ONSTACK(&barr->work, wq_barrier_func); __set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work)); init_completion(&barr->done); + barr->task = current; /* * If @target is currently being executed, schedule the -- cgit v1.3-8-gc7d7 From 3494fc30846dceb808de4cc02930ef347fabd21a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 9 Mar 2015 09:22:28 -0400 Subject: workqueue: dump workqueues on sysrq-t Workqueues are used extensively throughout the kernel but sometimes it's difficult to debug stalls involving work items because visibility into its inner workings is fairly limited. Although sysrq-t task dump annotates each active worker task with the information on the work item being executed, it is challenging to find out which work items are pending or delayed on which queues and how pools are being managed. This patch implements show_workqueue_state() which dumps all busy workqueues and pools and is called from the sysrq-t handler. At the end of sysrq-t dump, something like the following is printed. Showing busy workqueues and worker pools: ... workqueue filler_wq: flags=0x0 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=2/256 in-flight: 491:filler_workfn, 507:filler_workfn pwq 0: cpus=0 node=0 flags=0x0 nice=0 active=2/256 in-flight: 501:filler_workfn pending: filler_workfn ... workqueue test_wq: flags=0x8 pwq 2: cpus=1 node=0 flags=0x0 nice=0 active=1/1 in-flight: 510(RESCUER):test_workfn BAR(69) BAR(500) delayed: test_workfn1 BAR(492), test_workfn2 ... pool 0: cpus=0 node=0 flags=0x0 nice=0 workers=2 manager: 137 pool 2: cpus=1 node=0 flags=0x0 nice=0 workers=3 manager: 469 pool 3: cpus=1 node=0 flags=0x0 nice=-20 workers=2 idle: 16 pool 8: cpus=0-3 flags=0x4 nice=0 workers=2 manager: 62 The above shows that test_wq is executing test_workfn() on pid 510 which is the rescuer and also that there are two tasks 69 and 500 waiting for the work item to finish in flush_work(). 
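(As an aside, a dump like the one walked through here can also be requested from user space on kernels with the magic sysrq interface built in, for example:

	# echo t > /proc/sysrq-trigger

whether that file exists and is writable depends on the configuration.)
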
As test_wq has max_active of 1, there are two work items for test_workfn1() and test_workfn2() which are delayed till the current work item is finished. In addition, pid 492 is flushing test_workfn1(). The work item for test_workfn() is being executed on pwq of pool 2 which is the normal priority per-cpu pool for CPU 1. The pool has three workers, two of which are executing filler_workfn() for filler_wq and the last one is assuming the manager role trying to create more workers. This extra workqueue state dump will hopefully help chasing down hangs involving workqueues. v3: cpulist_pr_cont() replaced with "%*pbl" printf formatting. v2: As suggested by Andrew, minor formatting change in pr_cont_work(), printk()'s replaced with pr_info()'s, and cpumask printing now uses cpulist_pr_cont(). Signed-off-by: Tejun Heo Cc: Lai Jiangshan Cc: Linus Torvalds Cc: Andrew Morton CC: Ingo Molnar --- drivers/tty/sysrq.c | 1 + include/linux/workqueue.h | 1 + kernel/workqueue.c | 160 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 162 insertions(+) (limited to 'kernel') diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index 259a4d5a4e8f..843f2cdc280b 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -275,6 +275,7 @@ static struct sysrq_key_op sysrq_showregs_op = { static void sysrq_handle_showstate(int key) { show_state(); + show_workqueue_state(); } static struct sysrq_key_op sysrq_showstate_op = { .handler = sysrq_handle_showstate, diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index f597846ff605..deee212af8e0 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -454,6 +454,7 @@ extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); +extern void show_workqueue_state(void); /** * queue_work - queue work on a workqueue diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 0c329a6f0c51..1ca0b1d54e70 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4457,6 +4457,166 @@ void print_worker_info(const char *log_lvl, struct task_struct *task) } } +static void pr_cont_pool_info(struct worker_pool *pool) +{ + pr_cont(" cpus=%*pbl", nr_cpumask_bits, pool->attrs->cpumask); + if (pool->node != NUMA_NO_NODE) + pr_cont(" node=%d", pool->node); + pr_cont(" flags=0x%x nice=%d", pool->flags, pool->attrs->nice); +} + +static void pr_cont_work(bool comma, struct work_struct *work) +{ + if (work->func == wq_barrier_func) { + struct wq_barrier *barr; + + barr = container_of(work, struct wq_barrier, work); + + pr_cont("%s BAR(%d)", comma ? "," : "", + task_pid_nr(barr->task)); + } else { + pr_cont("%s %pf", comma ? "," : "", work->func); + } +} + +static void show_pwq(struct pool_workqueue *pwq) +{ + struct worker_pool *pool = pwq->pool; + struct work_struct *work; + struct worker *worker; + bool has_in_flight = false, has_pending = false; + int bkt; + + pr_info(" pwq %d:", pool->id); + pr_cont_pool_info(pool); + + pr_cont(" active=%d/%d%s\n", pwq->nr_active, pwq->max_active, + !list_empty(&pwq->mayday_node) ? 
" MAYDAY" : ""); + + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (worker->current_pwq == pwq) { + has_in_flight = true; + break; + } + } + if (has_in_flight) { + bool comma = false; + + pr_info(" in-flight:"); + hash_for_each(pool->busy_hash, bkt, worker, hentry) { + if (worker->current_pwq != pwq) + continue; + + pr_cont("%s %d%s:%pf", comma ? "," : "", + task_pid_nr(worker->task), + worker == pwq->wq->rescuer ? "(RESCUER)" : "", + worker->current_func); + list_for_each_entry(work, &worker->scheduled, entry) + pr_cont_work(false, work); + comma = true; + } + pr_cont("\n"); + } + + list_for_each_entry(work, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + has_pending = true; + break; + } + } + if (has_pending) { + bool comma = false; + + pr_info(" pending:"); + list_for_each_entry(work, &pool->worklist, entry) { + if (get_work_pwq(work) != pwq) + continue; + + pr_cont_work(comma, work); + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); + } + pr_cont("\n"); + } + + if (!list_empty(&pwq->delayed_works)) { + bool comma = false; + + pr_info(" delayed:"); + list_for_each_entry(work, &pwq->delayed_works, entry) { + pr_cont_work(comma, work); + comma = !(*work_data_bits(work) & WORK_STRUCT_LINKED); + } + pr_cont("\n"); + } +} + +/** + * show_workqueue_state - dump workqueue state + * + * Called from a sysrq handler and prints out all busy workqueues and + * pools. + */ +void show_workqueue_state(void) +{ + struct workqueue_struct *wq; + struct worker_pool *pool; + unsigned long flags; + int pi; + + rcu_read_lock_sched(); + + pr_info("Showing busy workqueues and worker pools:\n"); + + list_for_each_entry_rcu(wq, &workqueues, list) { + struct pool_workqueue *pwq; + bool idle = true; + + for_each_pwq(pwq, wq) { + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) { + idle = false; + break; + } + } + if (idle) + continue; + + pr_info("workqueue %s: flags=0x%x\n", wq->name, wq->flags); + + for_each_pwq(pwq, wq) { + spin_lock_irqsave(&pwq->pool->lock, flags); + if (pwq->nr_active || !list_empty(&pwq->delayed_works)) + show_pwq(pwq); + spin_unlock_irqrestore(&pwq->pool->lock, flags); + } + } + + for_each_pool(pool, pi) { + struct worker *worker; + bool first = true; + + spin_lock_irqsave(&pool->lock, flags); + if (pool->nr_workers == pool->nr_idle) + goto next_pool; + + pr_info("pool %d:", pool->id); + pr_cont_pool_info(pool); + pr_cont(" workers=%d", pool->nr_workers); + if (pool->manager) + pr_cont(" manager: %d", + task_pid_nr(pool->manager->task)); + list_for_each_entry(worker, &pool->idle_list, entry) { + pr_cont(" %s%d", first ? "idle: " : "", + task_pid_nr(worker->task)); + first = false; + } + pr_cont("\n"); + next_pool: + spin_unlock_irqrestore(&pool->lock, flags); + } + + rcu_read_unlock_sched(); +} + /* * CPU hotplug. * -- cgit v1.3-8-gc7d7 From c467ea763fd5d8795b7d1b5a78eb94b6ad8f66ad Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 4 Mar 2015 18:06:33 +0100 Subject: context_tracking: Rename context symbols to prepare for transition state Current context tracking symbols are designed to express living state. As such they are prefixed with "IN_": IN_USER, IN_KERNEL. Now we are going to use these symbols to also express state transitions such as context_tracking_enter(IN_USER) or context_tracking_exit(IN_USER). But while the "IN_" prefix works well to express entering a context, it's confusing to depict a context exit: context_tracking_exit(IN_USER) could mean two things: 1) We are exiting the current context to enter user context. 
2) We are exiting the user context We want 2) but the reviewer may be confused and understand 1) So lets disambiguate these symbols and rename them to CONTEXT_USER and CONTEXT_KERNEL. Acked-by: Rik van Riel Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- arch/x86/kernel/traps.c | 2 +- include/linux/context_tracking.h | 2 +- include/linux/context_tracking_state.h | 6 +++--- kernel/context_tracking.c | 8 ++++---- kernel/sched/core.c | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9d2073e2ecc9..756f74eed35d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) * but we need to notify RCU. */ rcu_nmi_enter(); - prev_state = IN_KERNEL; /* the value is irrelevant. */ + prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ } /* diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 37b81bd51ec0..427b056dfd3d 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -43,7 +43,7 @@ static inline enum ctx_state exception_enter(void) static inline void exception_exit(enum ctx_state prev_ctx) { if (context_tracking_is_enabled()) { - if (prev_ctx == IN_USER) + if (prev_ctx == CONTEXT_USER) context_tracking_user_enter(); } } diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 97a81225d037..8076f875c324 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -13,8 +13,8 @@ struct context_tracking { */ bool active; enum ctx_state { - IN_KERNEL = 0, - IN_USER, + CONTEXT_KERNEL = 0, + CONTEXT_USER, } state; }; @@ -34,7 +34,7 @@ static inline bool context_tracking_cpu_is_enabled(void) static inline bool context_tracking_in_user(void) { - return __this_cpu_read(context_tracking.state) == IN_USER; + return __this_cpu_read(context_tracking.state) == CONTEXT_USER; } #else static inline bool context_tracking_in_user(void) { return false; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 937ecdfdf258..8ad53c9d38b6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -75,7 +75,7 @@ void context_tracking_user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if ( __this_cpu_read(context_tracking.state) != IN_USER) { + if ( __this_cpu_read(context_tracking.state) != CONTEXT_USER) { if (__this_cpu_read(context_tracking.active)) { trace_user_enter(0); /* @@ -101,7 +101,7 @@ void context_tracking_user_enter(void) * OTOH we can spare the calls to vtime and RCU when context_tracking.active * is false because we know that CPU is not tickless. */ - __this_cpu_write(context_tracking.state, IN_USER); + __this_cpu_write(context_tracking.state, CONTEXT_USER); } local_irq_restore(flags); } @@ -129,7 +129,7 @@ void context_tracking_user_exit(void) return; local_irq_save(flags); - if (__this_cpu_read(context_tracking.state) == IN_USER) { + if (__this_cpu_read(context_tracking.state) == CONTEXT_USER) { if (__this_cpu_read(context_tracking.active)) { /* * We are going to run code that may use RCU. 
Inform @@ -139,7 +139,7 @@ void context_tracking_user_exit(void) vtime_user_exit(current); trace_user_exit(0); } - __this_cpu_write(context_tracking.state, IN_KERNEL); + __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } local_irq_restore(flags); } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..06b9a00871e0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2818,7 +2818,7 @@ asmlinkage __visible void __sched schedule_user(void) * we find a better solution. * * NB: There are buggy callers of this function. Ideally we - * should warn if prev_state != IN_USER, but that will trigger + * should warn if prev_state != CONTEXT_USER, but that will trigger * too frequently to make sense yet. */ enum ctx_state prev_state = exception_enter(); -- cgit v1.3-8-gc7d7 From 3aab4f50bff89bdea5066a05d4f3c5fa25bc37c7 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 10 Feb 2015 15:27:50 -0500 Subject: context_tracking: Generalize context tracking APIs to support user and guest Generalize the context tracking APIs to support various nature of contexts. This is performed by splitting out the mechanism from context_tracking_user_enter and context_tracking_user_exit into context_tracking_enter and context_tracking_exit. The nature of the context we track is now detailed in a ctx_state parameter pushed to these APIs, allowing the same functions to not just track kernel <> user space switching, but also kernel <> guest transitions. But leave the old functions in order to avoid breaking ARM, which calls these functions from assembler code, and cannot easily use C enum parameters. Reviewed-by: Paul E. McKenney Signed-off-by: Rik van Riel Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- include/linux/context_tracking.h | 9 ++++++--- kernel/context_tracking.c | 43 ++++++++++++++++++++++++++-------------- 2 files changed, 34 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 427b056dfd3d..7f1810a3b5a4 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -10,6 +10,8 @@ #ifdef CONFIG_CONTEXT_TRACKING extern void context_tracking_cpu_set(int cpu); +extern void context_tracking_enter(enum ctx_state state); +extern void context_tracking_exit(enum ctx_state state); extern void context_tracking_user_enter(void); extern void context_tracking_user_exit(void); extern void __context_tracking_task_switch(struct task_struct *prev, @@ -35,7 +37,8 @@ static inline enum ctx_state exception_enter(void) return 0; prev_ctx = this_cpu_read(context_tracking.state); - context_tracking_user_exit(); + if (prev_ctx != CONTEXT_KERNEL) + context_tracking_exit(prev_ctx); return prev_ctx; } @@ -43,8 +46,8 @@ static inline enum ctx_state exception_enter(void) static inline void exception_exit(enum ctx_state prev_ctx) { if (context_tracking_is_enabled()) { - if (prev_ctx == CONTEXT_USER) - context_tracking_user_enter(); + if (prev_ctx != CONTEXT_KERNEL) + context_tracking_enter(prev_ctx); } } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 8ad53c9d38b6..17715d811b71 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -39,15 +39,15 @@ void context_tracking_cpu_set(int cpu) } /** - * context_tracking_user_enter - Inform the context tracking that the CPU is going to - * enter userspace mode. 
+ * context_tracking_enter - Inform the context tracking that the CPU is going + * enter user or guest space mode. * * This function must be called right before we switch from the kernel - * to userspace, when it's guaranteed the remaining kernel instructions - * to execute won't use any RCU read side critical section because this - * function sets RCU in extended quiescent state. + * to user or guest space, when it's guaranteed the remaining kernel + * instructions to execute won't use any RCU read side critical section + * because this function sets RCU in extended quiescent state. */ -void context_tracking_user_enter(void) +void context_tracking_enter(enum ctx_state state) { unsigned long flags; @@ -75,7 +75,7 @@ void context_tracking_user_enter(void) WARN_ON_ONCE(!current->mm); local_irq_save(flags); - if ( __this_cpu_read(context_tracking.state) != CONTEXT_USER) { + if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { trace_user_enter(0); /* @@ -101,24 +101,31 @@ void context_tracking_user_enter(void) * OTOH we can spare the calls to vtime and RCU when context_tracking.active * is false because we know that CPU is not tickless. */ - __this_cpu_write(context_tracking.state, CONTEXT_USER); + __this_cpu_write(context_tracking.state, state); } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_enter); + +void context_tracking_user_enter(void) +{ + context_tracking_enter(CONTEXT_USER); +} NOKPROBE_SYMBOL(context_tracking_user_enter); /** - * context_tracking_user_exit - Inform the context tracking that the CPU is - * exiting userspace mode and entering the kernel. + * context_tracking_exit - Inform the context tracking that the CPU is + * exiting user or guest mode and entering the kernel. * - * This function must be called after we entered the kernel from userspace - * before any use of RCU read side critical section. This potentially include - * any high level kernel code like syscalls, exceptions, signal handling, etc... + * This function must be called after we entered the kernel from user or + * guest space before any use of RCU read side critical section. This + * potentially include any high level kernel code like syscalls, exceptions, + * signal handling, etc... * * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void context_tracking_user_exit(void) +void context_tracking_exit(enum ctx_state state) { unsigned long flags; @@ -129,7 +136,7 @@ void context_tracking_user_exit(void) return; local_irq_save(flags); - if (__this_cpu_read(context_tracking.state) == CONTEXT_USER) { + if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { /* * We are going to run code that may use RCU. Inform @@ -143,6 +150,12 @@ void context_tracking_user_exit(void) } local_irq_restore(flags); } +NOKPROBE_SYMBOL(context_tracking_exit); + +void context_tracking_user_exit(void) +{ + context_tracking_exit(CONTEXT_USER); +} NOKPROBE_SYMBOL(context_tracking_user_exit); /** -- cgit v1.3-8-gc7d7 From 19fdd98b6253404c6bdd6927bde9f962729376f7 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 10 Feb 2015 15:27:52 -0500 Subject: context_tracking: Run vtime_user_enter/exit only when state == CONTEXT_USER Only run vtime_user_enter, vtime_user_exit, and the user enter & exit trace points when we are entering or exiting user state, respectively. 
The KVM code in guest_enter and guest_exit already take care of calling vtime_guest_enter and vtime_guest_exit, respectively. The RCU code only distinguishes between "idle" and "not idle or kernel". There should be no need to add an additional (unused) state there. Reviewed-by: Paul E. McKenney Signed-off-by: Rik van Riel Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- kernel/context_tracking.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 17715d811b71..a2c0866384e8 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -77,7 +77,6 @@ void context_tracking_enter(enum ctx_state state) local_irq_save(flags); if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { - trace_user_enter(0); /* * At this stage, only low level arch entry code remains and * then we'll run in userspace. We can assume there won't be @@ -85,7 +84,10 @@ void context_tracking_enter(enum ctx_state state) * user_exit() or rcu_irq_enter(). Let's remove RCU's dependency * on the tick. */ - vtime_user_enter(current); + if (state == CONTEXT_USER) { + trace_user_enter(0); + vtime_user_enter(current); + } rcu_user_enter(); } /* @@ -143,8 +145,10 @@ void context_tracking_exit(enum ctx_state state) * RCU core about that (ie: we may need the tick again). */ rcu_user_exit(); - vtime_user_exit(current); - trace_user_exit(0); + if (state == CONTEXT_USER) { + vtime_user_exit(current); + trace_user_exit(0); + } } __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } -- cgit v1.3-8-gc7d7 From efc1e2c9bcbab73797d7bc214014cb916d6a8eb5 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 10 Feb 2015 15:27:53 -0500 Subject: context_tracking: Export context_tracking_user_enter/exit Export context_tracking_user_enter/exit so it can be used by KVM. Reviewed-by: Paul E. McKenney Signed-off-by: Rik van Riel Cc: Paul E. McKenney Cc: Andy Lutomirski Cc: Will deacon Cc: Marcelo Tosatti Cc: Christian Borntraeger Cc: Luiz Capitulino Cc: Paolo Bonzini Signed-off-by: Frederic Weisbecker --- kernel/context_tracking.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index a2c0866384e8..72d59a1a6eb6 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -108,6 +108,7 @@ void context_tracking_enter(enum ctx_state state) local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); +EXPORT_SYMBOL_GPL(context_tracking_enter); void context_tracking_user_enter(void) { @@ -155,6 +156,7 @@ void context_tracking_exit(enum ctx_state state) local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); +EXPORT_SYMBOL_GPL(context_tracking_exit); void context_tracking_user_exit(void) { -- cgit v1.3-8-gc7d7 From b24d443b8f17d9776f5fc1f6c780a0a21eb02913 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 4 Mar 2015 23:10:28 -0500 Subject: ftrace: Clear REGS_EN and TRAMP_EN flags on disabling record via sysctl When /proc/sys/kernel/ftrace_enabled is set to zero, all function tracing is disabled. But the records that represent the functions still hold information about the ftrace_ops that are hooked to them. 
ftrace_ops may request "REGS" (have a full set of pt_regs passed to the callback), or "TRAMP" (the ops has its own trampoline to use). When the record is updated to represent the state of the ops hooked to it, it sets "REGS_EN" and/or "TRAMP_EN" to state that the callback points to the correct trampoline (REGS has its own trampoline). When ftrace_enabled is set to zero, all ftrace locations are a nop, so they do not point to any trampoline. But the _EN flags are still set. This can cause the accounting to go wrong when ftrace_enabled is cleared and an ops that has a trampoline is registered or unregistered. For example, the following will cause ftrace to crash: # echo function_graph > /sys/kernel/debug/tracing/current_tracer # echo 0 > /proc/sys/kernel/ftrace_enabled # echo nop > /sys/kernel/debug/tracing/current_tracer # echo 1 > /proc/sys/kernel/ftrace_enabled # echo function_graph > /sys/kernel/debug/tracing/current_tracer As function_graph uses a trampoline, when ftrace_enabled is set to zero the updates to the record are not done. When enabling function_graph again, the record will still have the TRAMP_EN flag set, and it will look for an op that has a trampoline other than the function_graph ops, and fail to find one. Cc: stable@vger.kernel.org # 3.17+ Reported-by: Pratyush Anand Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 45e5cb143d17..14947e014b78 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2041,8 +2041,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) if (!ftrace_rec_count(rec)) rec->flags = 0; else - /* Just disable the record (keep REGS state) */ - rec->flags &= ~FTRACE_FL_ENABLED; + /* + * Just disable the record, but keep the ops TRAMP + * and REGS states. The _EN flags must be disabled though. + */ + rec->flags &= ~(FTRACE_FL_ENABLED | FTRACE_FL_TRAMP_EN | + FTRACE_FL_REGS_EN); } return FTRACE_UPDATE_MAKE_NOP; -- cgit v1.3-8-gc7d7 From 1619dc3f8f555ee1cdd3c75db3885d5715442b12 Mon Sep 17 00:00:00 2001 From: Pratyush Anand Date: Fri, 6 Mar 2015 23:58:06 +0530 Subject: ftrace: Fix en(dis)able graph caller when en(dis)abling record via sysctl When ftrace is enabled globally through the proc interface, we must check if ftrace_graph_active is set. If it is set, then we should also pass the FTRACE_START_FUNC_RET command to ftrace_run_update_code(). Similarly, when ftrace is disabled globally through the proc interface, we must check if ftrace_graph_active is set. If it is set, then we should also pass the FTRACE_STOP_FUNC_RET command to ftrace_run_update_code(). Consider the following situation. # echo 0 > /proc/sys/kernel/ftrace_enabled After this ftrace_enabled = 0. # echo function_graph > /sys/kernel/debug/tracing/current_tracer Since ftrace_enabled = 0, ftrace_enable_ftrace_graph_caller() is never called. # echo 1 > /proc/sys/kernel/ftrace_enabled Now ftrace_enabled will be set to true, but still ftrace_enable_ftrace_graph_caller() will not be called, which is not desired. Further if we execute the following after this: # echo nop > /sys/kernel/debug/tracing/current_tracer Now since ftrace_enabled is set it will call ftrace_disable_ftrace_graph_caller(), which causes a kernel warning on the ARM platform. On the ARM platform, when ftrace_enable_ftrace_graph_caller() is called, it checks whether the old instruction is a nop or not. 
If it's not a nop, then it returns an error. If it is a nop then it replaces instruction at that address with a branch to ftrace_graph_caller. ftrace_disable_ftrace_graph_caller() behaves just the opposite. Therefore, if generic ftrace code ever calls either ftrace_enable_ftrace_graph_caller() or ftrace_disable_ftrace_graph_caller() consecutively two times in a row, then it will return an error, which will cause the generic ftrace code to raise a warning. Note, x86 does not have an issue with this because the architecture specific code for ftrace_enable_ftrace_graph_caller() and ftrace_disable_ftrace_graph_caller() does not check the previous state, and calling either of these functions twice in a row has no ill effect. Link: http://lkml.kernel.org/r/e4fbe64cdac0dd0e86a3bf914b0f83c0b419f146.1425666454.git.panand@redhat.com Cc: stable@vger.kernel.org # 2.6.31+ Signed-off-by: Pratyush Anand [ removed extra if (ftrace_start_up) and defined ftrace_graph_active as 0 if CONFIG_FUNCTION_GRAPH_TRACER is not set. ] Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 14947e014b78..ea520bb54d44 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1059,6 +1059,12 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int ftrace_graph_active; +#else +# define ftrace_graph_active 0 +#endif + #ifdef CONFIG_DYNAMIC_FTRACE static struct ftrace_ops *removed_ops; @@ -2692,24 +2698,36 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) static void ftrace_startup_sysctl(void) { + int command; + if (unlikely(ftrace_disabled)) return; /* Force update next time */ saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_UPDATE_CALLS); + if (ftrace_start_up) { + command = FTRACE_UPDATE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_START_FUNC_RET; + ftrace_run_update_code(command); + } } static void ftrace_shutdown_sysctl(void) { + int command; + if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ - if (ftrace_start_up) - ftrace_run_update_code(FTRACE_DISABLE_CALLS); + if (ftrace_start_up) { + command = FTRACE_DISABLE_CALLS; + if (ftrace_graph_active) + command |= FTRACE_STOP_FUNC_RET; + ftrace_run_update_code(command); + } } static cycle_t ftrace_update_time; @@ -5594,8 +5612,6 @@ static struct ftrace_ops graph_ops = { ASSIGN_OPS_HASH(graph_ops, &global_ops.local_hash) }; -static int ftrace_graph_active; - int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) { return 0; -- cgit v1.3-8-gc7d7 From 524a38682573b2e15ab6317ccfe50280441514be Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 6 Mar 2015 19:55:13 -0500 Subject: ftrace: Fix ftrace enable ordering of sysctl ftrace_enabled Some archs (specifically PowerPC), are sensitive with the ordering of the enabling of the calls to function tracing and setting of the function to use to be traced. That is, update_ftrace_function() sets what function the ftrace_caller trampoline should call. Some archs require this to be set before calling ftrace_run_update_code(). Another bug was discovered, that ftrace_startup_sysctl() called ftrace_run_update_code() directly. 
If the function the ftrace_caller trampoline changes, then it will not be updated. Instead a call to ftrace_startup_enable() should be called because it tests to see if the callback changed since the code was disabled, and will tell the arch to update appropriately. Most archs do not need this notification, but PowerPC does. The problem could be seen by the following commands: # echo 0 > /proc/sys/kernel/ftrace_enabled # echo function > /sys/kernel/debug/tracing/current_tracer # echo 1 > /proc/sys/kernel/ftrace_enabled # cat /sys/kernel/debug/tracing/trace The trace will show that function tracing was not active. Cc: stable@vger.kernel.org # 2.6.27+ Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ea520bb54d44..4f228024055b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2710,7 +2710,7 @@ static void ftrace_startup_sysctl(void) command = FTRACE_UPDATE_CALLS; if (ftrace_graph_active) command |= FTRACE_START_FUNC_RET; - ftrace_run_update_code(command); + ftrace_startup_enable(command); } } @@ -5580,12 +5580,12 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, if (ftrace_enabled) { - ftrace_startup_sysctl(); - /* we are starting ftrace again */ if (ftrace_ops_list != &ftrace_list_end) update_ftrace_function(); + ftrace_startup_sysctl(); + } else { /* stopping ftrace calls (just send to ftrace_stub) */ ftrace_trace_function = ftrace_stub; -- cgit v1.3-8-gc7d7 From 44fb085bfa17628c6d2aaa6af6b292a8499e9cbd Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 10 Mar 2015 12:20:00 +0800 Subject: sched/deadline: Add rq->clock update skip for dl task yield This patch adds rq->clock update skip for SCHED_DEADLINE task yield, to tell update_rq_clock() that we've just updated the clock, so that we don't do a microscopic update in schedule() and double the fastpath cost. Signed-off-by: Wanpeng Li Cc: Juri Lelli Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1425961200-3809-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 3fa8fa6d9403..0a81a954c041 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -914,6 +914,12 @@ static void yield_task_dl(struct rq *rq) } update_rq_clock(rq); update_curr_dl(rq); + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() + * and double the fastpath cost. + */ + rq_clock_skip_update(rq, true); } #ifdef CONFIG_SMP -- cgit v1.3-8-gc7d7 From 8038dad7e888581266c76df15d70ca457a3c5910 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 25 Feb 2015 10:34:39 -0800 Subject: smpboot: Add common code for notification from dying CPU RCU ignores offlined CPUs, so they cannot safely run RCU read-side code. (They -can- use SRCU, but not RCU.) This means that any use of RCU during or after the call to arch_cpu_idle_dead(). Unfortunately, commit 2ed53c0d6cc99 added a complete() call, which will contain RCU read-side critical sections if there is a task waiting to be awakened. Which, as it turns out, there almost never is. In my qemu/KVM testing, the to-be-awakened task is not yet asleep more than 99.5% of the time. 
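A minimal sketch of the problematic pattern just described (illustrative only; the function and completion names below are made up, not taken from any architecture's code):

    /*
     * Hypothetical example of the unsafe pattern: by this point RCU has
     * stopped watching the outgoing CPU, yet complete() may wake a waiter,
     * and the wakeup path can enter RCU read-side critical sections.
     */
    static DECLARE_COMPLETION(example_cpu_died);    /* hypothetical */

    static void example_arch_cpu_idle_dead(void)
    {
            /* RCU is no longer watching this CPU here ... */
            complete(&example_cpu_died);    /* ... but this may do an RCU-protected wakeup */
            while (1)
                    cpu_relax();            /* wait to be physically powered off */
    }

The cpu_report_death()/cpu_wait_death() pair added below is intended to let architectures replace such a completion with a polled per-CPU atomic, which the dying CPU can update without entering any RCU read-side critical section.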
In current mainline, failure is even harder to reproduce, requiring a virtualized environment that delays the outgoing CPU by at least three jiffies between the time it exits its stop_machine() task at CPU_DYING time and the time it calls arch_cpu_idle_dead() from the idle loop. However, this problem really can occur, especially in virtualized environments, and therefore really does need to be fixed This suggests moving back to the polling loop, but using a much shorter wait, with gentle exponential backoff instead of the old 100-millisecond wait. Most of the time, the loop will exit without waiting at all, and almost all of the remaining uses will wait only five microseconds. If the outgoing CPU is preempted, a loop will wait one jiffy, then increase the wait by a factor of 11/10ths, rounding up. As before, there is a five-second timeout. This commit therefore provides common-code infrastructure to do the dying-to-surviving CPU handoff in a safe manner. This code also provides an indication at CPU-online of whether the CPU to be onlined previously timed out on offline. The new cpu_check_up_prepare() function returns -EBUSY if this CPU previously took more than five seconds to go offline, or -EAGAIN if it has not yet managed to go offline. The rationale for -EAGAIN is that it might still be preempted, so an additional wait might well find it correctly offlined. Architecture-specific code can decide how to handle these conditions. Systems in which CPUs take themselves completely offline might respond to an -EBUSY return as if it was a zero (success) return. Systems in which the surviving CPU must take some action might take it at this time, or might simply mark the other CPU as unusable. Note that architectures that take the easy way out and simply pass the -EBUSY and -EAGAIN upwards will change the sysfs API. Signed-off-by: Paul E. McKenney Cc: Cc: [ paulmck: Fixed state machine for architectures that don't check earlier CPU-hotplug results as suggested by James Hogan. ] --- include/linux/cpu.h | 12 ++++ kernel/smpboot.c | 156 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 4260e8594bd7..4744ef915acd 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -95,6 +95,8 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ +#define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, + * perhaps due to preemption. 
*/ /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend * operation in progress @@ -271,4 +273,14 @@ void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); void arch_cpu_idle_dead(void); +DECLARE_PER_CPU(bool, cpu_dead_idle); + +int cpu_report_state(int cpu); +int cpu_check_up_prepare(int cpu); +void cpu_set_state_online(int cpu); +#ifdef CONFIG_HOTPLUG_CPU +bool cpu_wait_death(unsigned int cpu, int seconds); +bool cpu_report_death(void); +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ + #endif /* _LINUX_CPU_H_ */ diff --git a/kernel/smpboot.c b/kernel/smpboot.c index 40190f28db35..c697f73d82d6 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread) put_online_cpus(); } EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); + +static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); + +/* + * Called to poll specified CPU's state, for example, when waiting for + * a CPU to come online. + */ +int cpu_report_state(int cpu) +{ + return atomic_read(&per_cpu(cpu_hotplug_state, cpu)); +} + +/* + * If CPU has died properly, set its state to CPU_UP_PREPARE and + * return success. Otherwise, return -EBUSY if the CPU died after + * cpu_wait_death() timed out. And yet otherwise again, return -EAGAIN + * if cpu_wait_death() timed out and the CPU still hasn't gotten around + * to dying. In the latter two cases, the CPU might not be set up + * properly, but it is up to the arch-specific code to decide. + * Finally, -EIO indicates an unanticipated problem. + * + * Note that it is permissible to omit this call entirely, as is + * done in architectures that do no CPU-hotplug error checking. + */ +int cpu_check_up_prepare(int cpu) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) { + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); + return 0; + } + + switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) { + + case CPU_POST_DEAD: + + /* The CPU died properly, so just start it up again. */ + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE); + return 0; + + case CPU_DEAD_FROZEN: + + /* + * Timeout during CPU death, so let caller know. + * The outgoing CPU completed its processing, but after + * cpu_wait_death() timed out and reported the error. The + * caller is free to proceed, in which case the state + * will be reset properly by cpu_set_state_online(). + * Proceeding despite this -EBUSY return makes sense + * for systems where the outgoing CPUs take themselves + * offline, with no post-death manipulation required from + * a surviving CPU. + */ + return -EBUSY; + + case CPU_BROKEN: + + /* + * The most likely reason we got here is that there was + * a timeout during CPU death, and the outgoing CPU never + * did complete its processing. This could happen on + * a virtualized system if the outgoing VCPU gets preempted + * for more than five seconds, and the user attempts to + * immediately online that same CPU. Trying again later + * might return -EBUSY above, hence -EAGAIN. + */ + return -EAGAIN; + + default: + + /* Should not happen. Famous last words. */ + return -EIO; + } +} + +/* + * Mark the specified CPU online. + * + * Note that it is permissible to omit this call entirely, as is + * done in architectures that do no CPU-hotplug error checking. 
+ */ +void cpu_set_state_online(int cpu) +{ + (void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE); +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* + * Wait for the specified CPU to exit the idle loop and die. + */ +bool cpu_wait_death(unsigned int cpu, int seconds) +{ + int jf_left = seconds * HZ; + int oldstate; + bool ret = true; + int sleep_jf = 1; + + might_sleep(); + + /* The outgoing CPU will normally get done quite quickly. */ + if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD) + goto update_state; + udelay(5); + + /* But if the outgoing CPU dawdles, wait increasingly long times. */ + while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) { + schedule_timeout_uninterruptible(sleep_jf); + jf_left -= sleep_jf; + if (jf_left <= 0) + break; + sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10); + } +update_state: + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); + if (oldstate == CPU_DEAD) { + /* Outgoing CPU died normally, update state. */ + smp_mb(); /* atomic_read() before update. */ + atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD); + } else { + /* Outgoing CPU still hasn't died, set state accordingly. */ + if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + oldstate, CPU_BROKEN) != oldstate) + goto update_state; + ret = false; + } + return ret; +} + +/* + * Called by the outgoing CPU to report its successful death. Return + * false if this report follows the surviving CPU's timing out. + * + * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU + * timed out. This approach allows architectures to omit calls to + * cpu_check_up_prepare() and cpu_set_state_online() without defeating + * the next cpu_wait_death()'s polling loop. + */ +bool cpu_report_death(void) +{ + int oldstate; + int newstate; + int cpu = smp_processor_id(); + + do { + oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu)); + if (oldstate != CPU_BROKEN) + newstate = CPU_DEAD; + else + newstate = CPU_DEAD_FROZEN; + } while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu), + oldstate, newstate) != oldstate); + return newstate == CPU_DEAD; +} + +#endif /* #ifdef CONFIG_HOTPLUG_CPU */ -- cgit v1.3-8-gc7d7 From b33078b6098148c3efdacc907249a247c9d5491e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 16 Jan 2015 14:01:21 -0800 Subject: rcu: Consolidate offline-CPU callback initialization Currently, both rcu_cleanup_dead_cpu() and rcu_send_cbs_to_orphanage() initialize the outgoing CPU's callback list. However, only rcu_cleanup_dead_cpu() invokes rcu_send_cbs_to_orphanage(), and it does so unconditionally, which means that only one of these initializations is required. This commit therefore consolidates the callback-list initialization with the rest of the callback handling in rcu_send_cbs_to_orphanage(). Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 48d640ca1a05..8e020c59ecfd 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2256,8 +2256,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; } - /* Finally, initialize the rcu_data structure's list to empty. */ + /* + * Finally, initialize the rcu_data structure's list to empty and + * disallow further callbacks on this CPU. 
+ */ init_callback_list(rdp); + rdp->nxttail[RCU_NEXT_TAIL] = NULL; } /* @@ -2398,9 +2402,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); - init_callback_list(rdp); - /* Disallow further callbacks on this CPU. */ - rdp->nxttail[RCU_NEXT_TAIL] = NULL; mutex_unlock(&rsp->onoff_mutex); } -- cgit v1.3-8-gc7d7 From 78043c467a91573cc1d51827fe10d7d15ae79a60 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Jan 2015 17:46:24 -0800 Subject: rcu: Put all orphan-callback-related code under same comment Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8e020c59ecfd..98da632d1d49 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2385,9 +2385,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Exclude any attempts to start a new grace period. */ mutex_lock(&rsp->onoff_mutex); - raw_spin_lock_irqsave(&rsp->orphan_lock, flags); /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ + raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); -- cgit v1.3-8-gc7d7 From c8aead6a9b27fdd94b7bcb74b587ae012d8145f2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 16:56:46 -0800 Subject: rcu: Simplify sync_rcu_preempt_exp_init() This commit eliminates a boolean and associated "if" statement by rearranging the code. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a571e9a0f1d..d37c9fbdba71 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -677,19 +677,16 @@ static void sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { unsigned long flags; - int must_wait = 0; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); if (!rcu_preempt_has_tasks(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_report_exp_rnp(rsp, rnp, false); /* No tasks, report. */ } else { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - must_wait = 1; } - if (!must_wait) - rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ } /** -- cgit v1.3-8-gc7d7 From 18c629eaebf1814ca7f0c27327f75aa93aa4a5de Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Jan 2015 18:59:56 -0800 Subject: rcu: Eliminate empty HOTPLUG_CPU ifdef Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index d37c9fbdba71..79376e2461c9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -520,10 +520,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) WARN_ON_ONCE(rnp->qsmask); } -#ifdef CONFIG_HOTPLUG_CPU - -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Check for a quiescent state from the current CPU. When a task blocks, * the task is recorded in the corresponding CPU's rcu_node structure, -- cgit v1.3-8-gc7d7 From 237a0f2193c6daf9b1edd7fd15d55e680f268952 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Thu, 22 Jan 2015 14:32:06 -0800 Subject: rcu: Detect stalls caused by failure to propagate up rcu_node tree If all CPUs have passed through quiescent states, then stalls might be due to starvation of the grace-period kthread or to failure to propagate the quiescent states up the rcu_node combining tree. The current stall warning messages do not differentiate, so this commit adds a printout of the root rcu_node structure's ->qsmask field. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 98da632d1d49..3b7e4133ca99 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1196,9 +1196,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) } else { j = jiffies; gpa = ACCESS_ONCE(rsp->gp_activity); - pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", + pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", rsp->name, j - gpa, j, gpa, - jiffies_till_next_fqs); + jiffies_till_next_fqs, + rcu_get_root(rsp)->qsmask); /* In this case, the current CPU might be at fault. */ sched_show_task(current); } -- cgit v1.3-8-gc7d7 From 37745d281069682d901f00c0121949a7d224195f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Jan 2015 18:24:08 -0800 Subject: rcu: Provide diagnostic option to slow down grace-period initialization Grace-period initialization normally proceeds quite quickly, so that it is very difficult to reproduce races against grace-period initialization. This commit therefore allows grace-period initialization to be artificially slowed down, increasing race-reproduction probability. A pair of new Kconfig parameters are provided, CONFIG_RCU_TORTURE_TEST_SLOW_INIT to enable the slowdowns, and CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY to specify the number of jiffies of slowdown to apply. A boot-time parameter named rcutree.gp_init_delay allows boot-time delay to be specified. By default, no delay will be applied even if CONFIG_RCU_TORTURE_TEST_SLOW_INIT is set. Signed-off-by: Paul E. McKenney --- Documentation/kernel-parameters.txt | 6 ++++++ kernel/rcu/tree.c | 10 ++++++++++ lib/Kconfig.debug | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+) (limited to 'kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index bfcb1a62a7b4..94de410ec341 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2968,6 +2968,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Set maximum number of finished RCU callbacks to process in one batch. + rcutree.gp_init_delay= [KNL] + Set the number of jiffies to delay each step of + RCU grace-period initialization. This only has + effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is + set. + rcutree.rcu_fanout_leaf= [KNL] Increase the number of CPUs assigned to each leaf rcu_node structure. Useful for very large diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3b7e4133ca99..b42001fd55fb 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -160,6 +160,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO; module_param(kthread_prio, int, 0644); +/* Delay in jiffies for grace-period initialization delays. 
*/ +static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) + ? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY + : 0; +module_param(gp_init_delay, int, 0644); + /* * Track the rcutorture test sequence number and the update version * number within a given test. The rcutorture_testseq is incremented @@ -1769,6 +1775,10 @@ static int rcu_gp_init(struct rcu_state *rsp) raw_spin_unlock_irq(&rnp->lock); cond_resched_rcu_qs(); ACCESS_ONCE(rsp->gp_activity) = jiffies; + if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) && + gp_init_delay > 0 && + !(rsp->gpnum % (rcu_num_nodes * 10))) + schedule_timeout_uninterruptible(gp_init_delay); } mutex_unlock(&rsp->onoff_mutex); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c5cefb3c009c..feee8dab441e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1257,6 +1257,30 @@ config RCU_TORTURE_TEST_RUNNABLE Say N here if you want the RCU torture tests to start only after being manually enabled via /proc. +config RCU_TORTURE_TEST_SLOW_INIT + bool "Slow down RCU grace-period initialization to expose races" + depends on RCU_TORTURE_TEST + help + This option makes grace-period initialization block for a + few jiffies between initializing each pair of consecutive + rcu_node structures. This helps to expose races involving + grace-period initialization, in other words, it makes your + kernel less stable. It can also greatly increase grace-period + latency, especially on systems with large numbers of CPUs. + This is useful when torture-testing RCU, but in almost no + other circumstance. + + Say Y here if you want your system to crash and hang more often. + Say N if you want a sane system. + +config RCU_TORTURE_TEST_SLOW_INIT_DELAY + int "How much to slow down RCU grace-period initialization" + range 0 5 + default 0 + help + This option specifies the number of jiffies to wait between + each rcu_node structure initialization. + config RCU_CPU_STALL_TIMEOUT int "RCU CPU stall timeout in seconds" depends on RCU_STALL_COMMON -- cgit v1.3-8-gc7d7 From 999c286347538388170f919146d7cfa58689472e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 31 Jan 2015 21:12:02 -0800 Subject: rcu: Remove event tracing from rcu_cpu_notify(), used by offline CPUs Offline CPUs cannot safely invoke trace events, but such CPUs do execute within rcu_cpu_notify(). Therefore, this commit removes the trace events from rcu_cpu_notify(). These trace events are for utilization, against which rcu_cpu_notify() execution time should be negligible. Reported-by: Fengguang Wu Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b42001fd55fb..a7151d26b940 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3629,7 +3629,6 @@ static int rcu_cpu_notify(struct notifier_block *self, struct rcu_node *rnp = rdp->mynode; struct rcu_state *rsp; - trace_rcu_utilization(TPS("Start CPU hotplug")); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: @@ -3661,7 +3660,6 @@ static int rcu_cpu_notify(struct notifier_block *self, default: break; } - trace_rcu_utilization(TPS("End CPU hotplug")); return NOTIFY_OK; } -- cgit v1.3-8-gc7d7 From 6086e346fdea1ae64d974c94c1acacc2605567ae Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:29 -0700 Subject: clocksource: Simplify the clocks_calc_max_nsecs() logic The previous clocks_calc_max_nsecs() code had some unecessarily complex bit logic to find the max interval that could cause multiplication overflows. Since this is not in the hot path, just do the divide to make it easier to read. The previous implementation also had a subtle issue that it avoided overflows with signed 64-bit values, where as the intervals are always unsigned. This resulted in overly conservative intervals, which other safety margins were then added to, reducing the intended interval length. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 4892352f0e49..2148f413256c 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -476,19 +476,10 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) /* * Calculate the maximum number of cycles that we can pass to the - * cyc2ns function without overflowing a 64-bit signed result. The - * maximum number of cycles is equal to ULLONG_MAX/(mult+maxadj) - * which is equivalent to the below. - * max_cycles < (2^63)/(mult + maxadj) - * max_cycles < 2^(log2((2^63)/(mult + maxadj))) - * max_cycles < 2^(log2(2^63) - log2(mult + maxadj)) - * max_cycles < 2^(63 - log2(mult + maxadj)) - * max_cycles < 1 << (63 - log2(mult + maxadj)) - * Please note that we add 1 to the result of the log2 to account for - * any rounding errors, ensure the above inequality is satisfied and - * no overflow will occur. + * cyc2ns() function without overflowing a 64-bit result. */ - max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1)); + max_cycles = ULLONG_MAX; + do_div(max_cycles, mult+maxadj); /* * The actual maximum number of cycles we can defer the clocksource is -- cgit v1.3-8-gc7d7 From 362fde0410377e468ca00ad363fdf3e3ec42eb6a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:30 -0700 Subject: clocksource: Simplify the logic around clocksource wrapping safety margins The clocksource logic has a number of places where we try to include a safety margin. Most of these are 12% safety margins, but they are inconsistently applied and sometimes are applied on top of each other. Additionally, in the previous patch, we corrected an issue where we unintentionally in effect created a 50% safety margin, which these 12.5% margins where then added to. 
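To put rough numbers on the overflow limit that the previous patch now computes with a straight divide (the values below are illustrative only, not taken from any particular clocksource):

    /*
     * Worked example: assume mult + maxadj is about 2^24.
     *
     * Old logic:
     *     max_cycles = 1ULL << (63 - (ilog2(mult + maxadj) + 1));
     *                = 1ULL << 38                  (~2.7e11 cycles)
     *
     * New logic:
     *     max_cycles = ULLONG_MAX;
     *     do_div(max_cycles, mult + maxadj);
     *               ~= 1ULL << 40                  (~1.1e12 cycles)
     *
     * i.e. the divide permits roughly four times as many cycles before the
     * 64-bit multiply in clocksource_cyc2ns() can overflow.
     */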
So to simplify the logic here, this patch removes the various 12.5% margins, and consolidates adding the margin in one place: clocks_calc_max_nsecs(). Additionally, Linus prefers a 50% safety margin, as it allows bad clock values to be more easily caught. This should really have no net effect, due to the corrected issue earlier which caused greater then 50% margins to be used w/o issue. Signed-off-by: John Stultz Acked-by: Stephen Boyd (for the sched_clock.c bit) Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 26 ++++++++++++-------------- kernel/time/sched_clock.c | 4 ++-- 2 files changed, 14 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 2148f413256c..ace95763b3a6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -469,6 +469,9 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * @shift: cycle to nanosecond divisor (power of two) * @maxadj: maximum adjustment value to mult (~11%) * @mask: bitmask for two's complement subtraction of non 64 bit counters + * + * NOTE: This function includes a safety margin of 50%, so that bad clock values + * can be detected. */ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) { @@ -490,11 +493,14 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) max_cycles = min(max_cycles, mask); max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + /* Return 50% of the actual maximum, so we can detect bad values */ + max_nsecs >>= 1; + return max_nsecs; } /** - * clocksource_max_deferment - Returns max time the clocksource can be deferred + * clocksource_max_deferment - Returns max time the clocksource should be deferred * @cs: Pointer to clocksource * */ @@ -504,13 +510,7 @@ static u64 clocksource_max_deferment(struct clocksource *cs) max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, cs->mask); - /* - * To ensure that the clocksource does not wrap whilst we are idle, - * limit the time the clocksource can be deferred by 12.5%. Please - * note a margin of 12.5% is used because this can be computed with - * a shift, versus say 10% which would require division. - */ - return max_nsecs - (max_nsecs >> 3); + return max_nsecs; } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -659,10 +659,9 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) * conversion precision. 10 minutes is still a reasonable * amount. That results in a shift value of 24 for a * clocksource with mask >= 40bit and f >= 4GHz. That maps to - * ~ 0.06ppm granularity for NTP. We apply the same 12.5% - * margin as we do in clocksource_max_deferment() + * ~ 0.06ppm granularity for NTP. */ - sec = (cs->mask - (cs->mask >> 3)); + sec = cs->mask; do_div(sec, freq); do_div(sec, scale); if (!sec) @@ -674,9 +673,8 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) NSEC_PER_SEC / scale, sec * scale); /* - * for clocksources that have large mults, to avoid overflow. - * Since mult may be adjusted by ntp, add an safety extra margin - * + * Ensure clocksources that have large 'mult' values don't overflow + * when adjusted. 
*/ cs->maxadj = clocksource_max_adjustment(cs); while ((cs->mult + cs->maxadj < cs->mult) diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 01d2d15aa662..3b8ae45020c1 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -125,9 +125,9 @@ void __init sched_clock_register(u64 (*read)(void), int bits, new_mask = CLOCKSOURCE_MASK(bits); - /* calculate how many ns until we wrap */ + /* calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); - new_wrap_kt = ns_to_ktime(wrap - (wrap >> 3)); + new_wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); -- cgit v1.3-8-gc7d7 From fb82fe2fe8588745edd73aa3a6229facac5c1e15 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:31 -0700 Subject: clocksource: Add 'max_cycles' to 'struct clocksource' In order to facilitate clocksource validation, add a 'max_cycles' field to the clocksource structure which will hold the maximum cycle value that can safely be multiplied without potentially causing an overflow. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-4-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/clocksource.h | 5 +++-- kernel/time/clocksource.c | 28 ++++++++++++++++------------ kernel/time/sched_clock.c | 2 +- 3 files changed, 20 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 9c78d15d33e4..16d048cadebb 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -56,6 +56,7 @@ struct module; * @shift: cycle to nanosecond divisor (power of two) * @max_idle_ns: max idle time permitted by the clocksource (nsecs) * @maxadj: maximum adjustment value to mult (~11%) + * @max_cycles: maximum safe cycle value which won't overflow on multiplication * @flags: flags describing special properties * @archdata: arch-specific data * @suspend: suspend function for the clocksource, if necessary @@ -76,7 +77,7 @@ struct clocksource { #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA struct arch_clocksource_data archdata; #endif - + u64 max_cycles; const char *name; struct list_head list; int rating; @@ -189,7 +190,7 @@ extern struct clocksource * __init clocksource_default_clock(void); extern void clocksource_mark_unstable(struct clocksource *cs); extern u64 -clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask); +clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cycles); extern void clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ace95763b3a6..fc2a9de43ca1 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -469,11 +469,13 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * @shift: cycle to nanosecond divisor (power of two) * @maxadj: maximum adjustment value to mult (~11%) * @mask: bitmask for two's complement subtraction of non 64 bit counters + * @max_cyc: maximum cycle value before potential overflow (does not include + * any safety margin) * * NOTE: This function includes a safety margin of 50%, so that bad clock values * can be detected. 
*/ -u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) +u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { u64 max_nsecs, max_cycles; @@ -493,6 +495,10 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) max_cycles = min(max_cycles, mask); max_nsecs = clocksource_cyc2ns(max_cycles, mult - maxadj, shift); + /* return the max_cycles value as well if requested */ + if (max_cyc) + *max_cyc = max_cycles; + /* Return 50% of the actual maximum, so we can detect bad values */ max_nsecs >>= 1; @@ -500,17 +506,15 @@ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask) } /** - * clocksource_max_deferment - Returns max time the clocksource should be deferred - * @cs: Pointer to clocksource + * clocksource_update_max_deferment - Updates the clocksource max_idle_ns & max_cycles + * @cs: Pointer to clocksource to be updated * */ -static u64 clocksource_max_deferment(struct clocksource *cs) +static inline void clocksource_update_max_deferment(struct clocksource *cs) { - u64 max_nsecs; - - max_nsecs = clocks_calc_max_nsecs(cs->mult, cs->shift, cs->maxadj, - cs->mask); - return max_nsecs; + cs->max_idle_ns = clocks_calc_max_nsecs(cs->mult, cs->shift, + cs->maxadj, cs->mask, + &cs->max_cycles); } #ifndef CONFIG_ARCH_USES_GETTIMEOFFSET @@ -684,7 +688,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) cs->maxadj = clocksource_max_adjustment(cs); } - cs->max_idle_ns = clocksource_max_deferment(cs); + clocksource_update_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -730,8 +734,8 @@ int clocksource_register(struct clocksource *cs) "Clocksource %s might overflow on 11%% adjustment\n", cs->name); - /* calculate max idle time permitted for this clocksource */ - cs->max_idle_ns = clocksource_max_deferment(cs); + /* Update max idle time permitted for this clocksource */ + clocksource_update_max_deferment(cs); mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 3b8ae45020c1..ca3bc5c7027c 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -126,7 +126,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, new_mask = CLOCKSOURCE_MASK(bits); /* calculate how many nanosecs until we risk wrapping */ - wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask); + wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); new_wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ -- cgit v1.3-8-gc7d7 From 80f1d68ccba70b1060c9c7360ca83da430f66bed Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 12 Mar 2015 17:21:42 +0100 Subject: ebpf: verifier: check that call reg with ARG_ANYTHING is initialized I noticed that a helper function with argument type ARG_ANYTHING does not need to have an initialized value (register). This can worst case lead to unintented stack memory leakage in future helper functions if they are not carefully designed, or unintended application behaviour in case the application developer was not careful enough to match a correct helper function signature in the API. 
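For orientation, the argument types being discussed come from each helper's struct bpf_func_proto. A sketch of roughly what the bpf_map_update_elem() prototype looked like at the time (reconstructed for illustration; the exact field values are an assumption and may not match the tree):

    const struct bpf_func_proto bpf_map_update_elem_proto = {
            .func       = bpf_map_update_elem,
            .gpl_only   = false,
            .ret_type   = RET_INTEGER,
            .arg1_type  = ARG_CONST_MAP_PTR,    /* r1: map */
            .arg2_type  = ARG_PTR_TO_MAP_KEY,   /* r2: key */
            .arg3_type  = ARG_PTR_TO_MAP_VALUE, /* r3: value */
            .arg4_type  = ARG_ANYTHING,         /* r4: flags -- an argument that really is used */
            /* r5 is unused; after this patch an unset field defaults to ARG_DONTCARE (0) */
    };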
The underlying issue is that ARG_ANYTHING should actually be split into two different semantics: 1) ARG_DONTCARE for function arguments that the helper function does not care about (in other words: the default for unused function arguments), and 2) ARG_ANYTHING that is an argument actually being used by a helper function and *guaranteed* to be an initialized register. The current risk is low: ARG_ANYTHING is only used for the 'flags' argument (r4) in bpf_map_update_elem() that internally does strict checking. Fixes: 17a5267067f3 ("bpf: verifier (add verifier core)") Signed-off-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 4 +++- kernel/bpf/verifier.c | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index a884f5a2c503..80f2e0fc3d02 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -44,7 +44,7 @@ struct bpf_map_type_list { /* function argument constraints */ enum bpf_arg_type { - ARG_ANYTHING = 0, /* any argument is ok */ + ARG_DONTCARE = 0, /* unused argument in helper function */ /* the following constraints used to prototype * bpf_map_lookup/update/delete_elem() functions @@ -58,6 +58,8 @@ enum bpf_arg_type { */ ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ + + ARG_ANYTHING, /* any (initialized) argument is ok */ }; /* type of values returned from helper functions */ diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index bdf4192a889b..e6b522496250 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -755,7 +755,7 @@ static int check_func_arg(struct verifier_env *env, u32 regno, enum bpf_reg_type expected_type; int err = 0; - if (arg_type == ARG_ANYTHING) + if (arg_type == ARG_DONTCARE) return 0; if (reg->type == NOT_INIT) { @@ -763,6 +763,9 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return -EACCES; } + if (arg_type == ARG_ANYTHING) + return 0; + if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; -- cgit v1.3-8-gc7d7 From 8eb74b2b291e7bf6aa59fcb4e59f236382f00bf5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 13 Feb 2015 10:52:28 -0800 Subject: rcu: Rework preemptible expedited bitmask handling Currently, the rcu_node tree ->expmask bitmasks are initially set to reflect the online CPUs. This is pointless, because only the CPUs preempted within RCU read-side critical sections by the preceding synchronize_sched_expedited() need to be tracked. This commit therefore instead sets up these bitmasks based on the state of the ->blkd_tasks lists. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 98 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 75 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 79376e2461c9..a22721547442 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -626,9 +626,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) * recursively up the tree. (Calm down, calm down, we do the recursion * iteratively!) * - * Most callers will set the "wake" flag, but the task initiating the - * expedited grace period need not wake itself. - * * Caller must hold sync_rcu_preempt_exp_mutex. 
*/ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, @@ -663,26 +660,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, /* * Snapshot the tasks blocking the newly started preemptible-RCU expedited - * grace period for the specified rcu_node structure. If there are no such - * tasks, report it up the rcu_node hierarchy. + * grace period for the specified rcu_node structure, phase 1. If there + * are such tasks, set the ->expmask bits up the rcu_node tree and also + * set the ->expmask bits on the leaf rcu_node structures to tell phase 2 + * that work is needed here. * - * Caller must hold sync_rcu_preempt_exp_mutex and must exclude - * CPU hotplug operations. + * Caller must hold sync_rcu_preempt_exp_mutex. */ static void -sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) +sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp) { unsigned long flags; + unsigned long mask; + struct rcu_node *rnp_up; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rnp->expmask); + WARN_ON_ONCE(rnp->exp_tasks); if (!rcu_preempt_has_tasks(rnp)) { + /* No blocked tasks, nothing to do. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); - rcu_report_exp_rnp(rsp, rnp, false); /* No tasks, report. */ - } else { + return; + } + /* Call for Phase 2 and propagate ->expmask bits up the tree. */ + rnp->expmask = 1; + rnp_up = rnp; + while (rnp_up->parent) { + mask = rnp_up->grpmask; + rnp_up = rnp_up->parent; + if (rnp_up->expmask & mask) + break; + raw_spin_lock(&rnp_up->lock); /* irqs already off */ + smp_mb__after_unlock_lock(); + rnp_up->expmask |= mask; + raw_spin_unlock(&rnp_up->lock); /* irqs still off */ + } + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + +/* + * Snapshot the tasks blocking the newly started preemptible-RCU expedited + * grace period for the specified rcu_node structure, phase 2. If the + * leaf rcu_node structure has its ->expmask field set, check for tasks. + * If there are some, clear ->expmask and set ->exp_tasks accordingly, + * then initiate RCU priority boosting. Otherwise, clear ->expmask and + * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits, + * enabling rcu_read_unlock_special() to do the bit-clearing. + * + * Caller must hold sync_rcu_preempt_exp_mutex. + */ +static void +sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); + if (!rnp->expmask) { + /* Phase 1 didn't do anything, so Phase 2 doesn't either. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; + } + + /* Phase 1 is over. */ + rnp->expmask = 0; + + /* + * If there are still blocked tasks, set up ->exp_tasks so that + * rcu_read_unlock_special() will wake us and then boost them. + */ + if (rcu_preempt_has_tasks(rnp)) { rnp->exp_tasks = rnp->blkd_tasks.next; rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ + return; } + + /* No longer any blocked tasks, so undo bit setting. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); + rcu_report_exp_rnp(rsp, rnp, false); } /** @@ -699,7 +755,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) */ void synchronize_rcu_expedited(void) { - unsigned long flags; struct rcu_node *rnp; struct rcu_state *rsp = &rcu_preempt_state; unsigned long snap; @@ -750,19 +805,16 @@ void synchronize_rcu_expedited(void) /* force all RCU readers onto ->blkd_tasks lists. 
*/ synchronize_sched_expedited(); - /* Initialize ->expmask for all non-leaf rcu_node structures. */ - rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); - rnp->expmask = rnp->qsmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - } - - /* Snapshot current state of ->blkd_tasks lists. */ + /* + * Snapshot current state of ->blkd_tasks lists into ->expmask. + * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special() + * to start clearing them. Doing this in one phase leads to + * strange races between setting and clearing bits, so just say "no"! + */ + rcu_for_each_leaf_node(rsp, rnp) + sync_rcu_preempt_exp_init1(rsp, rnp); rcu_for_each_leaf_node(rsp, rnp) - sync_rcu_preempt_exp_init(rsp, rnp); - if (NUM_RCU_NODES > 1) - sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); + sync_rcu_preempt_exp_init2(rsp, rnp); put_online_cpus(); -- cgit v1.3-8-gc7d7 From cc99a310caf811aebbd0986f433d824e4a5e7ce5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 23 Feb 2015 08:59:29 -0800 Subject: rcu: Move rcu_report_unblock_qs_rnp() to common code The rcu_report_unblock_qs_rnp() function is invoked when the last task blocking the current grace period exits its outermost RCU read-side critical section. Previously, this was called only from rcu_read_unlock_special(), and was therefore defined only when CONFIG_RCU_PREEMPT=y. However, this function will be invoked even when CONFIG_RCU_PREEMPT=n once CPU-hotplug operations are processed only at the beginnings of RCU grace periods. The reason for this change is that the last task on a given leaf rcu_node structure's ->blkd_tasks list might well exit its RCU read-side critical section between the time that recent CPU-hotplug operations were applied and when the new grace period was initialized. This situation could result in RCU waiting forever on that leaf rcu_node structure, because if all that structure's CPUs were already offline, there would be no quiescent-state events to drive that structure's part of the grace period. This commit therefore moves rcu_report_unblock_qs_rnp() to common code that is built unconditionally so that the quiescent-state-forcing code can clean up after this situation, avoiding the grace-period stall. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 39 +++++++++++++++++++++++++++++++++++++++ kernel/rcu/tree_plugin.h | 40 ++-------------------------------------- 2 files changed, 41 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a7151d26b940..5b5cb1ff73ed 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2126,6 +2126,45 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ } +/* + * Record a quiescent state for all tasks that were previously queued + * on the specified rcu_node structure and that were blocking the current + * RCU grace period. The caller must hold the specified rnp->lock with + * irqs disabled, and this lock is released upon return, but irqs remain + * disabled. 
+ */ +static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp, + struct rcu_node *rnp, unsigned long flags) + __releases(rnp->lock) +{ + unsigned long mask; + struct rcu_node *rnp_p; + + WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state); + if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; /* Still need more quiescent states! */ + } + + rnp_p = rnp->parent; + if (rnp_p == NULL) { + /* + * Either there is only one rcu_node in the tree, + * or tasks were kicked up to root rcu_node due to + * CPUs going offline. + */ + rcu_report_qs_rsp(rsp, flags); + return; + } + + /* Report up the rest of the hierarchy. */ + mask = rnp->grpmask; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); + rcu_report_qs_rnp(mask, rsp, rnp_p, flags); +} + /* * Record a quiescent state for the specified CPU to that CPU's rcu_data * structure. This must be either called from the specified CPU, or diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index a22721547442..ec6c2efb28cd 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -232,43 +232,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return rnp->gp_tasks != NULL; } -/* - * Record a quiescent state for all tasks that were previously queued - * on the specified rcu_node structure and that were blocking the current - * RCU grace period. The caller must hold the specified rnp->lock with - * irqs disabled, and this lock is released upon return, but irqs remain - * disabled. - */ -static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) - __releases(rnp->lock) -{ - unsigned long mask; - struct rcu_node *rnp_p; - - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); - return; /* Still need more quiescent states! */ - } - - rnp_p = rnp->parent; - if (rnp_p == NULL) { - /* - * Either there is only one rcu_node in the tree, - * or tasks were kicked up to root rcu_node due to - * CPUs going offline. - */ - rcu_report_qs_rsp(&rcu_preempt_state, flags); - return; - } - - /* Report up the rest of the hierarchy. */ - mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); -} - /* * Advance a ->blkd_tasks-list pointer to the next entry, instead * returning NULL if at the end of the list. @@ -399,7 +362,8 @@ void rcu_read_unlock_special(struct task_struct *t) rnp->grplo, rnp->grphi, !!rnp->gp_tasks); - rcu_report_unblock_qs_rnp(rnp, flags); + rcu_report_unblock_qs_rnp(&rcu_preempt_state, + rnp, flags); } else { raw_spin_unlock_irqrestore(&rnp->lock, flags); } -- cgit v1.3-8-gc7d7 From 0aa04b055e71bd3b8040dd71a126126c66b6f01e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jan 2015 21:52:37 -0800 Subject: rcu: Process offlining and onlining only at grace-period start Races between CPU hotplug and grace periods can be difficult to resolve, so the ->onoff_mutex is used to exclude the two events. Unfortunately, this means that it is impossible for an outgoing CPU to perform the last bits of its offlining from its last pass through the idle loop, because sleeplocks cannot be acquired in that context. 
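To make that constraint concrete (a hypothetical illustration, not code from this patch): the outgoing CPU's final offline steps run from its last trip through the idle loop, where something like

    mutex_lock(&rsp->onoff_mutex);  /* a sleeplock: may sleep, so it cannot be taken on the dying CPU's idle path */

is not permitted, while a raw spinlock such as the rcu_node ->lock used below can still be taken there.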
This commit avoids these problems by buffering online and offline events in a new ->qsmaskinitnext field in the leaf rcu_node structures. When a grace period starts, the events accumulated in this mask are applied to the ->qsmaskinit field, and, if needed, up the rcu_node tree. The special case of all CPUs corresponding to a given leaf rcu_node structure being offline while there are still elements in that structure's ->blkd_tasks list is handled using a new ->wait_blkd_tasks field. In this case, propagating the offline bits up the tree is deferred until the beginning of the grace period after all of the tasks have exited their RCU read-side critical sections and removed themselves from the list, at which point the ->wait_blkd_tasks flag is cleared. If one of that leaf rcu_node structure's CPUs comes back online before the list empties, then the ->wait_blkd_tasks flag is simply cleared. This of course means that RCU's notion of which CPUs are offline can be out of date. This is OK because RCU need only wait on CPUs that were online at the time that the grace period started. In addition, RCU's force-quiescent-state actions will handle the case where a CPU goes offline after the grace period starts. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 154 +++++++++++++++++++++++++++++++++++++---------- kernel/rcu/tree.h | 9 +++ kernel/rcu/tree_plugin.h | 22 ++----- kernel/rcu/tree_trace.c | 4 +- 4 files changed, 136 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 5b5cb1ff73ed..f0f4d3510d24 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); */ static int rcu_scheduler_fully_active __read_mostly; +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf); +static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf); static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); static void invoke_rcu_core(void); static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); @@ -178,6 +180,17 @@ module_param(gp_init_delay, int, 0644); unsigned long rcutorture_testseq; unsigned long rcutorture_vernum; +/* + * Compute the mask of online CPUs for the specified rcu_node structure. + * This will not be stable unless the rcu_node structure's ->lock is + * held, but the bit corresponding to the current CPU will be stable + * in most contexts. + */ +unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp) +{ + return ACCESS_ONCE(rnp->qsmaskinitnext); +} + /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -960,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void) preempt_disable(); rdp = this_cpu_ptr(&rcu_sched_data); rnp = rdp->mynode; - ret = (rdp->grpmask & rnp->qsmaskinit) || + ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) || !rcu_scheduler_fully_active; preempt_enable(); return ret; @@ -1710,6 +1723,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) */ static int rcu_gp_init(struct rcu_state *rsp) { + unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); @@ -1744,6 +1758,55 @@ static int rcu_gp_init(struct rcu_state *rsp) mutex_lock(&rsp->onoff_mutex); smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ + /* + * Apply per-leaf buffered online and offline operations to the + * rcu_node tree. 
Note that this new grace period need not wait + * for subsequent online CPUs, and that quiescent-state forcing + * will handle subsequent offline CPUs. + */ + rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irq(&rnp->lock); + smp_mb__after_unlock_lock(); + if (rnp->qsmaskinit == rnp->qsmaskinitnext && + !rnp->wait_blkd_tasks) { + /* Nothing to do on this leaf rcu_node structure. */ + raw_spin_unlock_irq(&rnp->lock); + continue; + } + + /* Record old state, apply changes to ->qsmaskinit field. */ + oldmask = rnp->qsmaskinit; + rnp->qsmaskinit = rnp->qsmaskinitnext; + + /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ + if (!oldmask != !rnp->qsmaskinit) { + if (!oldmask) /* First online CPU for this rcu_node. */ + rcu_init_new_rnp(rnp); + else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ + rnp->wait_blkd_tasks = true; + else /* Last offline CPU and can propagate. */ + rcu_cleanup_dead_rnp(rnp); + } + + /* + * If all waited-on tasks from prior grace period are + * done, and if all this rcu_node structure's CPUs are + * still offline, propagate up the rcu_node tree and + * clear ->wait_blkd_tasks. Otherwise, if one of this + * rcu_node structure's CPUs has since come back online, + * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() + * checks for this, so just call it unconditionally). + */ + if (rnp->wait_blkd_tasks && + (!rcu_preempt_has_tasks(rnp) || + rnp->qsmaskinit)) { + rnp->wait_blkd_tasks = false; + rcu_cleanup_dead_rnp(rnp); + } + + raw_spin_unlock_irq(&rnp->lock); + } + /* * Set the quiescent-state-needed bits in all the rcu_node * structures for all currently online CPUs in breadth-first order, @@ -2133,7 +2196,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * irqs disabled, and this lock is released upon return, but irqs remain * disabled. */ -static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp, +static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { @@ -2409,6 +2472,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) raw_spin_lock(&rnp->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); /* GP memory ordering. */ rnp->qsmaskinit &= ~mask; + rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ return; @@ -2427,6 +2491,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; + unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2443,12 +2508,12 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinit &= ~rdp->grpmask; - if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp)) - rcu_cleanup_dead_rnp(rnp); - rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. 
*/ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); + WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); @@ -2654,12 +2719,21 @@ static void force_qs_rnp(struct rcu_state *rsp, } } if (mask != 0) { - - /* rcu_report_qs_rnp() releases rnp->lock. */ + /* Idle/offline CPUs, report. */ rcu_report_qs_rnp(mask, rsp, rnp, flags); - continue; + } else if (rnp->parent && + list_empty(&rnp->blkd_tasks) && + !rnp->qsmask && + (rnp->parent->qsmask & rnp->grpmask)) { + /* + * Race between grace-period initialization and task + * existing RCU read-side critical section, report. + */ + rcu_report_unblock_qs_rnp(rsp, rnp, flags); + } else { + /* Nothing to do here, so just drop the lock. */ + raw_spin_unlock_irqrestore(&rnp->lock, flags); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); } } @@ -3568,6 +3642,28 @@ void rcu_barrier_sched(void) } EXPORT_SYMBOL_GPL(rcu_barrier_sched); +/* + * Propagate ->qsinitmask bits up the rcu_node tree to account for the + * first CPU in a given leaf rcu_node structure coming online. The caller + * must hold the corresponding leaf rcu_node ->lock with interrrupts + * disabled. + */ +static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) +{ + long mask; + struct rcu_node *rnp = rnp_leaf; + + for (;;) { + mask = rnp->grpmask; + rnp = rnp->parent; + if (rnp == NULL) + return; + raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + rnp->qsmaskinit |= mask; + raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + } +} + /* * Do boot-time initialization of a CPU's per-CPU RCU data. */ @@ -3620,31 +3716,23 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - /* Add CPU to rcu_node bitmasks. */ + /* + * Add CPU to leaf rcu_node pending-online bitmask. Any needed + * propagation up the rcu_node tree will happen at the beginning + * of the next grace period. + */ rnp = rdp->mynode; mask = rdp->grpmask; - do { - /* Exclude any attempts to start a new GP on small systems. */ - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - rnp->qsmaskinit |= mask; - mask = rnp->grpmask; - if (rnp == rdp->mynode) { - /* - * If there is a grace period in progress, we will - * set up to wait for it next time we run the - * RCU core code. - */ - rdp->gpnum = rnp->completed; - rdp->completed = rnp->completed; - rdp->passed_quiesce = 0; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - rdp->qs_pending = 0; - trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - } - raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ - rnp = rnp->parent; - } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - local_irq_restore(flags); + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ + smp_mb__after_unlock_lock(); + rnp->qsmaskinitnext |= mask; + rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. 
*/ + rdp->completed = rnp->completed; + rdp->passed_quiesce = false; + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->qs_pending = false; + trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); + raw_spin_unlock_irqrestore(&rnp->lock, flags); mutex_unlock(&rsp->onoff_mutex); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 119de399eb2f..aa42562ff5b2 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -141,12 +141,20 @@ struct rcu_node { /* complete (only for PREEMPT_RCU). */ unsigned long qsmaskinit; /* Per-GP initial value for qsmask & expmask. */ + /* Initialized from ->qsmaskinitnext at the */ + /* beginning of each grace period. */ + unsigned long qsmaskinitnext; + /* Online CPUs for next grace period. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ int grphi; /* highest-numbered CPU or group here. */ u8 grpnum; /* CPU/group number for next level up. */ u8 level; /* root is at level 0. */ + bool wait_blkd_tasks;/* Necessary to wait for blocked tasks to */ + /* exit RCU read-side critical sections */ + /* before propagating offline up the */ + /* rcu_node tree? */ struct rcu_node *parent; struct list_head blkd_tasks; /* Tasks blocked in RCU read-side critical */ @@ -559,6 +567,7 @@ static void rcu_prepare_kthreads(int cpu); static void rcu_cleanup_after_idle(void); static void rcu_prepare_for_idle(void); static void rcu_idle_count_callbacks_posted(void); +static bool rcu_preempt_has_tasks(struct rcu_node *rnp); static void print_cpu_stall_info_begin(void); static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); static void print_cpu_stall_info_end(void); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index ec6c2efb28cd..d45e961515c1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -180,7 +180,7 @@ static void rcu_preempt_note_context_switch(void) * But first, note that the current CPU must still be * on line! */ - WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); + WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0); WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); @@ -263,7 +263,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) */ void rcu_read_unlock_special(struct task_struct *t) { - bool empty; bool empty_exp; bool empty_norm; bool empty_exp_now; @@ -319,7 +318,6 @@ void rcu_read_unlock_special(struct task_struct *t) break; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } - empty = !rcu_preempt_has_tasks(rnp); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -339,14 +337,6 @@ void rcu_read_unlock_special(struct task_struct *t) drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; #endif /* #ifdef CONFIG_RCU_BOOST */ - /* - * If this was the last task on the list, go see if we - * need to propagate ->qsmaskinit bit clearing up the - * rcu_node tree. - */ - if (!empty && !rcu_preempt_has_tasks(rnp)) - rcu_cleanup_dead_rnp(rnp); - /* * If this was the last task on the current list, and if * we aren't waiting on any CPUs, report the quiescent state. 
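/*
 * [ Editorial aside, not part of the patch: a minimal user-space model of
 *   the ->qsmaskinitnext buffering described in this patch's changelog.
 *   Every name below (leaf_node, gp_apply_buffered_events, ...) is a
 *   hypothetical illustration, not a kernel API; it only demonstrates the
 *   zero-ness propagation rule applied at grace-period start. ]
 */
#include <stdbool.h>
#include <stdio.h>

struct leaf_node {
	unsigned long qsmaskinit;	/* CPUs the new grace period waits on. */
	unsigned long qsmaskinitnext;	/* Buffered online/offline events. */
	bool has_blocked_tasks;		/* Stand-in for rcu_preempt_has_tasks(). */
	bool wait_blkd_tasks;		/* Defer offline propagation? */
};

/* At grace-period start: fold buffered hotplug events into ->qsmaskinit. */
static void gp_apply_buffered_events(struct leaf_node *rnp)
{
	unsigned long oldmask = rnp->qsmaskinit;

	rnp->qsmaskinit = rnp->qsmaskinitnext;
	if (!oldmask == !rnp->qsmaskinit)
		return;			/* Zero-ness unchanged: nothing to push up. */
	if (!oldmask)
		printf("first CPU online: propagate init bits up the tree\n");
	else if (rnp->has_blocked_tasks)
		rnp->wait_blkd_tasks = true;	/* Readers still pin this leaf. */
	else
		printf("last CPU offline: propagate cleanup up the tree\n");
}

int main(void)
{
	struct leaf_node rnp = { .qsmaskinit = 0x3, .qsmaskinitnext = 0,
				 .has_blocked_tasks = true };

	gp_apply_buffered_events(&rnp);
	printf("wait_blkd_tasks=%d\n", rnp.wait_blkd_tasks);
	return 0;
}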
@@ -868,8 +858,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) return 0; } -#ifdef CONFIG_HOTPLUG_CPU - /* * Because there is no preemptible RCU, there can be no readers blocked. */ @@ -878,8 +866,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return false; } -#endif /* #ifdef CONFIG_HOTPLUG_CPU */ - /* * Because preemptible RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. @@ -1179,7 +1165,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) * Returns zero if all is well, a negated errno otherwise. */ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, - struct rcu_node *rnp) + struct rcu_node *rnp) { int rnp_index = rnp - &rsp->node[0]; unsigned long flags; @@ -1189,7 +1175,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (&rcu_preempt_state != rsp) return 0; - if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) + if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0) return 0; rsp->boost = 1; @@ -1282,7 +1268,7 @@ static void rcu_cpu_kthread(unsigned int cpu) static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) { struct task_struct *t = rnp->boost_kthread_task; - unsigned long mask = rnp->qsmaskinit; + unsigned long mask = rcu_rnp_online_cpus(rnp); cpumask_var_t cm; int cpu; diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index fbb6240509ea..f92361efd0f5 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_puts(m, "\n"); level = rnp->level; } - seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", - rnp->qsmask, rnp->qsmaskinit, + seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d ", + rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext, ".G"[rnp->gp_tasks != NULL], ".E"[rnp->exp_tasks != NULL], ".T"[!list_empty(&rnp->blkd_tasks)], -- cgit v1.3-8-gc7d7 From c199068913c9c5cbb5498e289bb387703e087ea8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Jan 2015 22:29:37 -0800 Subject: rcu: Eliminate ->onoff_mutex from rcu_node structure Because that RCU grace-period initialization need no longer exclude CPU-hotplug operations, this commit eliminates the ->onoff_mutex and its uses. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 15 --------------- kernel/rcu/tree.h | 2 -- 2 files changed, 17 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f0f4d3510d24..79d53399247e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -101,7 +101,6 @@ struct rcu_state sname##_state = { \ .orphan_nxttail = &sname##_state.orphan_nxtlist, \ .orphan_donetail = &sname##_state.orphan_donelist, \ .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ - .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ .name = RCU_STATE_NAME(sname), \ .abbr = sabbr, \ }; \ @@ -1754,10 +1753,6 @@ static int rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); raw_spin_unlock_irq(&rnp->lock); - /* Exclude any concurrent CPU-hotplug operations. */ - mutex_lock(&rsp->onoff_mutex); - smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */ - /* * Apply per-leaf buffered online and offline operations to the * rcu_node tree. 
Note that this new grace period need not wait @@ -1844,7 +1839,6 @@ static int rcu_gp_init(struct rcu_state *rsp) schedule_timeout_uninterruptible(gp_init_delay); } - mutex_unlock(&rsp->onoff_mutex); return 1; } @@ -2498,9 +2492,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) /* Adjust any no-longer-needed kthreads. */ rcu_boost_kthread_setaffinity(rnp, -1); - /* Exclude any attempts to start a new grace period. */ - mutex_lock(&rsp->onoff_mutex); - /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ raw_spin_lock_irqsave(&rsp->orphan_lock, flags); rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); @@ -2517,7 +2508,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); - mutex_unlock(&rsp->onoff_mutex); } #else /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -3700,9 +3690,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rcu_get_root(rsp); - /* Exclude new grace periods. */ - mutex_lock(&rsp->onoff_mutex); - /* Set up local state, ensuring consistent view of global state. */ raw_spin_lock_irqsave(&rnp->lock, flags); rdp->beenonline = 1; /* We have now been online. */ @@ -3733,8 +3720,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qs_pending = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore(&rnp->lock, flags); - - mutex_unlock(&rsp->onoff_mutex); } static void rcu_prepare_cpu(int cpu) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index aa42562ff5b2..a69d3dab2ec4 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -456,8 +456,6 @@ struct rcu_state { long qlen; /* Total number of callbacks. */ /* End of fields guarded by orphan_lock. */ - struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ - struct mutex barrier_mutex; /* Guards barrier fields. */ atomic_t barrier_cpu_count; /* # CPUs waiting on. */ struct completion barrier_completion; /* Wake at barrier end. */ -- cgit v1.3-8-gc7d7 From 528a25b00e1f84eaba6c98e63f58ee0a8e472102 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jan 2015 14:09:43 -0800 Subject: cpu: Make CPU-offline idle-loop transition point more precise This commit uses a per-CPU variable to make the CPU-offline code path through the idle loop more precise, so that the outgoing CPU is guaranteed to make it into the idle loop before it is powered off. This commit is in preparation for putting the RCU offline-handling code on this code path, which will eliminate the magic one-jiffy wait that RCU uses as the maximum time for an outgoing CPU to get all the way through the scheduler. The magic one-jiffy wait for incoming CPUs remains a separate issue. Signed-off-by: Paul E. McKenney --- kernel/cpu.c | 4 +++- kernel/sched/idle.c | 7 ++++++- 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 1972b161c61e..d46b4dae0ca0 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -408,8 +408,10 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) * * Wait for the stop thread to go away. */ - while (!idle_cpu(cpu)) + while (!per_cpu(cpu_dead_idle, cpu)) cpu_relax(); + smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ + per_cpu(cpu_dead_idle, cpu) = false; /* This actually kills the CPU. 
*/ __cpu_die(cpu); diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 94b2d7b88a27..e99e361ade20 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -198,6 +198,8 @@ exit_idle: start_critical_timings(); } +DEFINE_PER_CPU(bool, cpu_dead_idle); + /* * Generic idle loop implementation * @@ -222,8 +224,11 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) + if (cpu_is_offline(smp_processor_id())) { + smp_mb(); /* all activity before dead. */ + this_cpu_write(cpu_dead_idle, true); arch_cpu_idle_dead(); + } local_irq_disable(); arch_cpu_idle_enter(); -- cgit v1.3-8-gc7d7 From 88428cc5c27c63a4313e213813bc39b9899224d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Jan 2015 14:42:09 -0800 Subject: rcu: Handle outgoing CPUs on exit from idle loop This commit informs RCU of an outgoing CPU just before that CPU invokes arch_cpu_idle_dead() during its last pass through the idle loop (via a new CPU_DYING_IDLE notifier value). This change means that RCU need not deal with outgoing CPUs passing through the scheduler after informing RCU that they are no longer online. Note that removing the CPU from the rcu_node ->qsmaskinit bit masks is done at CPU_DYING_IDLE time, and orphaning callbacks is still done at CPU_DEAD time, the reason being that at CPU_DEAD time we have another CPU that can adopt them. Signed-off-by: Paul E. McKenney --- include/linux/cpu.h | 2 ++ include/linux/rcupdate.h | 2 ++ kernel/rcu/tree.c | 41 +++++++++++++++++++++++++++++++---------- kernel/sched/idle.c | 2 ++ 4 files changed, 37 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 4744ef915acd..d028721748d4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -95,6 +95,8 @@ enum { * Called on the new cpu, just before * enabling interrupts. Must not sleep, * must not fail */ +#define CPU_DYING_IDLE 0x000B /* CPU (unsigned)v dying, reached + * idle loop. */ #define CPU_BROKEN 0x000C /* CPU (unsigned)v did not die properly, * perhaps due to preemption. */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 78097491cd99..762022f07afd 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -266,6 +266,8 @@ void rcu_idle_enter(void); void rcu_idle_exit(void); void rcu_irq_enter(void); void rcu_irq_exit(void); +int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu); #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 79d53399247e..d5247ed44004 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2475,6 +2475,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) } } +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave(&rnp->lock, flags); + smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. 
*/ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore(&rnp->lock, flags); +} + /* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, @@ -2485,7 +2505,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { unsigned long flags; - unsigned long mask; struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ @@ -2498,13 +2517,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); - WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", cpu, rdp->qlen, rdp->nxtlist); @@ -2520,6 +2532,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) { } +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ +} + static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) { } @@ -3733,8 +3749,8 @@ static void rcu_prepare_cpu(int cpu) /* * Handle CPU online/offline notification events. */ -static int rcu_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +int rcu_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { long cpu = (long)hcpu; struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); @@ -3760,6 +3776,11 @@ static int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; + case CPU_DYING_IDLE: + for_each_rcu_flavor(rsp) { + rcu_cleanup_dying_idle_cpu(cpu, rsp); + } + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index e99e361ade20..b0090accfb5b 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -225,6 +225,8 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { + rcu_cpu_notify(NULL, CPU_DYING_IDLE, + (void *)(long)smp_processor_id()); smp_mb(); /* all activity before dead. */ this_cpu_write(cpu_dead_idle, true); arch_cpu_idle_dead(); -- cgit v1.3-8-gc7d7 From 5c60d25fa1b22fdcf141f8006d31c32b08db7311 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 9 Feb 2015 05:37:47 -0800 Subject: rcu: Add diagnostics to grace-period cleanup At grace-period initialization time, RCU checks that all quiescent states were really reported for the previous grace period. Now that grace-period cleanup has been split out of grace-period initialization, this commit also performs those checks at grace-period cleanup time. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d5247ed44004..17b5abf999ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1920,6 +1920,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) rcu_for_each_node_breadth_first(rsp, rnp) { raw_spin_lock_irq(&rnp->lock); smp_mb__after_unlock_lock(); + WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); + WARN_ON_ONCE(rnp->qsmask); ACCESS_ONCE(rnp->completed) = rsp->gpnum; rdp = this_cpu_ptr(rsp->rda); if (rnp == rdp->mynode) -- cgit v1.3-8-gc7d7 From a5af5aa8b67dfdba36c853b70564fd2dfe73d478 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 12 Mar 2015 16:26:11 -0700 Subject: kasan, module, vmalloc: rework shadow allocation for modules Current approach in handling shadow memory for modules is broken. Shadow memory could be freed only after memory shadow corresponds it is no longer used. vfree() called from interrupt context could use memory its freeing to store 'struct llist_node' in it: void vfree(const void *addr) { ... if (unlikely(in_interrupt())) { struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); Later this list node used in free_work() which actually frees memory. Currently module_memfree() called in interrupt context will free shadow before freeing module's memory which could provoke kernel crash. So shadow memory should be freed after module's memory. However, such deallocation order could race with kasan_module_alloc() in module_alloc(). Free shadow right before releasing vm area. At this point vfree()'d memory is not used anymore and yet not available for other allocations. New VM_KASAN flag used to indicate that vm area has dynamically allocated shadow memory so kasan frees shadow only if it was previously allocated. 
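[ Editorial sketch, not part of the patch: the flag and ordering idea above can be modelled in a few lines of plain C. The names below (vm_area, module_shadow_alloc, remove_area) are hypothetical stand-ins, not the kernel's kasan/vmalloc API; the point is only that shadow is freed when, and only when, a VM_KASAN-style flag says it was allocated, and that it is freed before the area itself is released. ]

#include <stdio.h>
#include <stdlib.h>

#define VM_KASAN 0x80			/* hypothetical mirror of the new flag */

struct vm_area {
	unsigned long flags;
	void *addr;
	void *shadow;			/* stand-in for kasan_mem_to_shadow(addr) */
};

static int module_shadow_alloc(struct vm_area *vm, size_t size)
{
	vm->shadow = calloc(1, size / 8);	/* shadow covers 1/8 of the region */
	if (!vm->shadow)
		return -1;
	vm->flags |= VM_KASAN;			/* remember that shadow exists */
	return 0;
}

/* Free shadow only if it was allocated, and before the area goes away. */
static void remove_area(struct vm_area *vm)
{
	if (vm->flags & VM_KASAN)
		free(vm->shadow);
	free(vm->addr);				/* releasing the area comes last */
}

int main(void)
{
	struct vm_area vm = { .addr = malloc(4096) };

	if (module_shadow_alloc(&vm, 4096) == 0)
		printf("shadow tracked via a VM_KASAN-style flag\n");
	remove_area(&vm);
	return 0;
}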
Signed-off-by: Andrey Ryabinin Acked-by: Rusty Russell Cc: Dmitry Vyukov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 5 +++-- include/linux/vmalloc.h | 1 + kernel/module.c | 2 -- mm/kasan/kasan.c | 14 +++++++++++--- mm/vmalloc.c | 1 + 5 files changed, 16 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 72ba725ddf9c..5fa48a21d73e 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -5,6 +5,7 @@ struct kmem_cache; struct page; +struct vm_struct; #ifdef CONFIG_KASAN @@ -52,7 +53,7 @@ void kasan_slab_free(struct kmem_cache *s, void *object); #define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT) int kasan_module_alloc(void *addr, size_t size); -void kasan_module_free(void *addr); +void kasan_free_shadow(const struct vm_struct *vm); #else /* CONFIG_KASAN */ @@ -82,7 +83,7 @@ static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {} static inline void kasan_slab_free(struct kmem_cache *s, void *object) {} static inline int kasan_module_alloc(void *addr, size_t size) { return 0; } -static inline void kasan_module_free(void *addr) {} +static inline void kasan_free_shadow(const struct vm_struct *vm) {} #endif /* CONFIG_KASAN */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 7d7acb35603d..0ec598381f97 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -17,6 +17,7 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */ #define VM_VPAGES 0x00000010 /* buffer for pages was vmalloc'ed */ #define VM_UNINITIALIZED 0x00000020 /* vm_struct is not fully initialized */ #define VM_NO_GUARD 0x00000040 /* don't add guard page */ +#define VM_KASAN 0x00000080 /* has allocated kasan shadow memory */ /* bits [20..32] reserved for arch specific ioremap internals */ /* diff --git a/kernel/module.c b/kernel/module.c index cc93cf68653c..b3d634ed06c9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -56,7 +56,6 @@ #include #include #include -#include #include #include #include @@ -1814,7 +1813,6 @@ static void unset_module_init_ro_nx(struct module *mod) { } void __weak module_memfree(void *module_region) { vfree(module_region); - kasan_module_free(module_region); } void __weak module_arch_cleanup(struct module *mod) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 78fee632a7ee..936d81661c47 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "kasan.h" @@ -414,12 +415,19 @@ int kasan_module_alloc(void *addr, size_t size) GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); - return ret ? 
0 : -ENOMEM; + + if (ret) { + find_vm_area(addr)->flags |= VM_KASAN; + return 0; + } + + return -ENOMEM; } -void kasan_module_free(void *addr) +void kasan_free_shadow(const struct vm_struct *vm) { - vfree(kasan_mem_to_shadow(addr)); + if (vm->flags & VM_KASAN) + vfree(kasan_mem_to_shadow(vm->addr)); } static void register_global(struct kasan_global *global) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 35b25e1340ca..49abccf29a29 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1418,6 +1418,7 @@ struct vm_struct *remove_vm_area(const void *addr) spin_unlock(&vmap_area_lock); vmap_debug_free_range(va->va_start, va->va_end); + kasan_free_shadow(vm); free_unmap_vmap_area(va); vm->size -= PAGE_SIZE; -- cgit v1.3-8-gc7d7 From 66ee59af630fd8d5f4f56fb28162857e629aa0ab Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 11 Feb 2015 19:56:46 +0100 Subject: fs: remove ki_nbytes There is no need to pass the total request length in the kiocb, as we already get passed in through the iov_iter argument. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/aio.c | 34 ++++++++++++++++++---------------- fs/ceph/file.c | 2 +- fs/nfs/direct.c | 2 +- fs/ocfs2/file.c | 8 +++----- fs/read_write.c | 8 -------- fs/udf/file.c | 2 +- include/linux/aio.h | 1 - kernel/printk/printk.c | 2 +- mm/page_io.c | 1 - net/socket.c | 6 +++--- 10 files changed, 28 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/fs/aio.c b/fs/aio.c index 118a2e0088d8..667054c7c067 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1344,12 +1344,13 @@ typedef ssize_t (rw_iter_op)(struct kiocb *, struct iov_iter *); static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, int rw, char __user *buf, unsigned long *nr_segs, + size_t *len, struct iovec **iovec, bool compat) { ssize_t ret; - *nr_segs = kiocb->ki_nbytes; + *nr_segs = *len; #ifdef CONFIG_COMPAT if (compat) @@ -1364,21 +1365,22 @@ static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb, if (ret < 0) return ret; - /* ki_nbytes now reflect bytes instead of segs */ - kiocb->ki_nbytes = ret; + /* len now reflect bytes instead of segs */ + *len = ret; return 0; } static ssize_t aio_setup_single_vector(struct kiocb *kiocb, int rw, char __user *buf, unsigned long *nr_segs, + size_t len, struct iovec *iovec) { - if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes))) + if (unlikely(!access_ok(!rw, buf, len))) return -EFAULT; iovec->iov_base = buf; - iovec->iov_len = kiocb->ki_nbytes; + iovec->iov_len = len; *nr_segs = 1; return 0; } @@ -1388,7 +1390,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb, * Performs the initial checks and io submission. */ static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode, - char __user *buf, bool compat) + char __user *buf, size_t len, bool compat) { struct file *file = req->ki_filp; ssize_t ret; @@ -1423,21 +1425,21 @@ rw_common: if (!rw_op && !iter_op) return -EINVAL; - ret = (opcode == IOCB_CMD_PREADV || - opcode == IOCB_CMD_PWRITEV) - ? 
aio_setup_vectored_rw(req, rw, buf, &nr_segs, - &iovec, compat) - : aio_setup_single_vector(req, rw, buf, &nr_segs, - iovec); + if (opcode == IOCB_CMD_PREADV || opcode == IOCB_CMD_PWRITEV) + ret = aio_setup_vectored_rw(req, rw, buf, &nr_segs, + &len, &iovec, compat); + else + ret = aio_setup_single_vector(req, rw, buf, &nr_segs, + len, iovec); if (!ret) - ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes); + ret = rw_verify_area(rw, file, &req->ki_pos, len); if (ret < 0) { if (iovec != inline_vecs) kfree(iovec); return ret; } - req->ki_nbytes = ret; + len = ret; /* XXX: move/kill - rw_verify_area()? */ /* This matches the pread()/pwrite() logic */ @@ -1450,7 +1452,7 @@ rw_common: file_start_write(file); if (iter_op) { - iov_iter_init(&iter, rw, iovec, nr_segs, req->ki_nbytes); + iov_iter_init(&iter, rw, iovec, nr_segs, len); ret = iter_op(req, &iter); } else { ret = rw_op(req, iovec, nr_segs, req->ki_pos); @@ -1553,10 +1555,10 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_obj.user = user_iocb; req->ki_user_data = iocb->aio_data; req->ki_pos = iocb->aio_offset; - req->ki_nbytes = iocb->aio_nbytes; ret = aio_run_iocb(req, iocb->aio_lio_opcode, (char __user *)(unsigned long)iocb->aio_buf, + iocb->aio_nbytes, compat); if (ret) goto out_put_req; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 905986dd4c3c..081c4e3f9e49 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -807,7 +807,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *filp = iocb->ki_filp; struct ceph_file_info *fi = filp->private_data; - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct page *pinned_page = NULL; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7077521acdf4..27cebf164070 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -265,7 +265,7 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t return -EINVAL; #else - VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); + VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE); if (rw == READ) return nfs_file_direct_read(iocb, iter, pos); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 46e0d4e857c7..266845de2100 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2280,7 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ - if (iocb->ki_nbytes == 0) + if (count == 0) return 0; appending = file->f_flags & O_APPEND ? 
1 : 0; @@ -2330,8 +2330,7 @@ relock: } can_do_direct = direct_io; - ret = ocfs2_prepare_inode_for_write(file, ppos, - iocb->ki_nbytes, appending, + ret = ocfs2_prepare_inode_for_write(file, ppos, count, appending, &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); @@ -2339,8 +2338,7 @@ relock: } if (direct_io && !is_sync_kiocb(iocb)) - unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes, - *ppos); + unaligned_dio = ocfs2_is_io_unaligned(inode, count, *ppos); /* * We can't complete the direct I/O as requested, fall back to diff --git a/fs/read_write.c b/fs/read_write.c index 8e1b68786d66..f8b8fc1316ab 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -343,7 +343,6 @@ ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= READ; ret = file->f_op->read_iter(&kiocb, iter); @@ -366,7 +365,6 @@ ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos) init_sync_kiocb(&kiocb, file); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = iov_iter_count(iter); iter->type |= WRITE; ret = file->f_op->write_iter(&kiocb, iter); @@ -426,7 +424,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) @@ -446,7 +443,6 @@ ssize_t new_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *p init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, READ, &iov, 1, len); ret = filp->f_op->read_iter(&kiocb, &iter); @@ -510,7 +506,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos); if (-EIOCBQUEUED == ret) @@ -530,7 +525,6 @@ ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, lo init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, WRITE, &iov, 1, len); ret = filp->f_op->write_iter(&kiocb, &iter); @@ -719,7 +713,6 @@ static ssize_t do_iter_readv_writev(struct file *filp, int rw, const struct iove init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; iov_iter_init(&iter, rw, iov, nr_segs, len); ret = fn(&kiocb, &iter); @@ -737,7 +730,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov, init_sync_kiocb(&kiocb, filp); kiocb.ki_pos = *ppos; - kiocb.ki_nbytes = len; ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos); if (ret == -EIOCBQUEUED) diff --git a/fs/udf/file.c b/fs/udf/file.c index 08f3555fbeac..9c0b6da9dbb3 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); int err, pos; - size_t count = iocb->ki_nbytes; + size_t count = iov_iter_count(from); struct udf_inode_info *iinfo = UDF_I(inode); mutex_lock(&inode->i_mutex); diff --git a/include/linux/aio.h b/include/linux/aio.h index d9c92daa3944..132d1ecba435 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -42,7 +42,6 @@ struct kiocb { __u64 ki_user_data; /* user's data for completion */ loff_t ki_pos; - size_t ki_nbytes; /* copy of iocb->aio_nbytes */ struct list_head ki_list; /* the aio core 
uses this * for cancellation */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c06df7de0963..60b2aa2a2da9 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -521,7 +521,7 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ - size_t len = iocb->ki_nbytes; + size_t len = iov_iter_count(from); ssize_t ret = len; if (len > LOG_LINE_MAX) diff --git a/mm/page_io.c b/mm/page_io.c index e6045804c8d8..7ef21577856c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -274,7 +274,6 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE); init_sync_kiocb(&kiocb, swap_file); kiocb.ki_pos = page_file_offset(page); - kiocb.ki_nbytes = PAGE_SIZE; set_page_writeback(page); unlock_page(page); diff --git a/net/socket.c b/net/socket.c index bbedbfcb42c2..f92145554f34 100644 --- a/net/socket.c +++ b/net/socket.c @@ -858,11 +858,11 @@ static ssize_t sock_read_iter(struct kiocb *iocb, struct iov_iter *to) if (iocb->ki_pos != 0) return -ESPIPE; - if (iocb->ki_nbytes == 0) /* Match SYS5 behaviour */ + if (!iov_iter_count(to)) /* Match SYS5 behaviour */ return 0; res = __sock_recvmsg(iocb, sock, &msg, - iocb->ki_nbytes, msg.msg_flags); + iov_iter_count(to), msg.msg_flags); *to = msg.msg_iter; return res; } @@ -883,7 +883,7 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; - res = __sock_sendmsg(iocb, sock, &msg, iocb->ki_nbytes); + res = __sock_sendmsg(iocb, sock, &msg, iov_iter_count(from)); *from = msg.msg_iter; return res; } -- cgit v1.3-8-gc7d7 From 3c17ad19f0697ffe5ef7438cdafc2d2b7757d8a5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:32 -0700 Subject: timekeeping: Add debugging checks to warn if we see delays Recently there's been requests for better sanity checking in the time code, so that it's more clear when something is going wrong, since timekeeping issues could manifest in a large number of strange ways in various subsystems. Thus, this patch adds some extra infrastructure to add a check to update_wall_time() to print two new warnings: 1) if we see the call delayed beyond the 'max_cycles' overflow point, 2) or if we see the call delayed beyond the clocksource's 'max_idle_ns' value, which is currently 50% of the overflow point. This extra infrastructure is conditional on a new CONFIG_DEBUG_TIMEKEEPING option, also added in this patch - default off. Tested this a bit by halting qemu for specified lengths of time to trigger the warnings. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-5-git-send-email-john.stultz@linaro.org [ Improved the changelog and the messages a bit. 
] Signed-off-by: Ingo Molnar --- kernel/time/jiffies.c | 1 + kernel/time/timekeeping.c | 28 ++++++++++++++++++++++++++++ lib/Kconfig.debug | 13 +++++++++++++ 3 files changed, 42 insertions(+) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a6a5bf53e86d..7e413902aa6a 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -71,6 +71,7 @@ static struct clocksource clocksource_jiffies = { .mask = 0xffffffff, /*32bits*/ .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, + .max_cycles = 10, }; __cacheline_aligned_in_smp DEFINE_SEQLOCK(jiffies_lock); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91db94136c10..acf049144cf6 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -118,6 +118,31 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->offs_boot = ktime_add(tk->offs_boot, delta); } +#ifdef CONFIG_DEBUG_TIMEKEEPING +static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ + + cycle_t max_cycles = tk->tkr.clock->max_cycles; + const char *name = tk->tkr.clock->name; + + if (offset > max_cycles) { + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow\n", + offset, name, max_cycles); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope\n"); + } else { + if (offset > (max_cycles >> 1)) { + printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", + offset, name, max_cycles >> 1); + printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); + } + } +} +#else +static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) +{ +} +#endif + /** * tk_setup_internals - Set up internals to use clocksource clock. * @@ -1630,6 +1655,9 @@ void update_wall_time(void) if (offset < real_tk->cycle_interval) goto out; + /* Do some additional sanity checking */ + timekeeping_check_update(real_tk, offset); + /* * With NO_HZ we may have to accumulate many cycle_intervals * (think "ticks") worth of time at once. To do this efficiently, diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c5cefb3c009c..36b6fa88ce5b 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -865,6 +865,19 @@ config SCHED_STACK_END_CHECK data corruption or a sporadic crash at a later stage once the region is examined. The runtime overhead introduced is minimal. +config DEBUG_TIMEKEEPING + bool "Enable extra timekeeping sanity checking" + help + This option will enable additional timekeeping sanity checks + which may be helpful when diagnosing issues where timekeeping + problems are suspected. + + This may include checks in the timekeeping hotpaths, so this + option may have a (very small) performance impact to some + workloads. + + If unsure, say N. + config TIMER_STATS bool "Collect kernel timers statistics" depends on DEBUG_KERNEL && PROC_FS -- cgit v1.3-8-gc7d7 From a558cd021d83b65c47ee5b9bec1fcfe5298a769f Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:33 -0700 Subject: timekeeping: Add checks to cap clocksource reads to the 'max_cycles' value When calculating the current delta since the last tick, we currently have no hard protections to prevent a multiplication overflow from occuring. 
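[ Editorial sketch, not part of the patch: the guarded delta computation discussed in this and the neighbouring patches can be modelled in user space. guarded_delta() is a hypothetical name; it combines the max_cycles cap introduced here with the mask-relative underflow check added two patches later. ]

#include <stdio.h>
#include <stdint.h>

typedef uint64_t cycle_t;

/* Mask-relative subtraction, as clocksource_delta() does for wrapping counters. */
static cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask)
{
	return (now - last) & mask;
}

/*
 * Sketch of the guarded delta: treat small "negative" (mask-relative) values
 * as zero, and cap the result so the later multiply cannot overflow.
 */
static cycle_t guarded_delta(cycle_t now, cycle_t last, cycle_t mask,
			     cycle_t max_cycles)
{
	cycle_t delta = clocksource_delta(now, last, mask);

	if ((~delta & mask) < (mask >> 3))	/* clock likely went backwards a bit */
		return 0;
	if (delta > max_cycles)			/* would overflow the mult later */
		return max_cycles;
	return delta;
}

int main(void)
{
	cycle_t mask = 0xffffffff, max_cycles = 0x100000;

	/* "now" slightly behind "last": reported as zero, not a huge wraparound. */
	printf("%llx\n", (unsigned long long)guarded_delta(100, 105, mask, max_cycles));
	/* A delay past max_cycles is capped rather than overflowing. */
	printf("%llx\n", (unsigned long long)guarded_delta(0x2000000, 0, mask, max_cycles));
	return 0;
}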
This patch introduces infrastructure to allow a cap that limits the clocksource read delta value to the 'max_cycles' value, which is where an overflow would occur. Since this is in the hotpath, it adds the extra checking under CONFIG_DEBUG_TIMEKEEPING=y. There was some concern that capping time like this could cause problems as we may stop expiring timers, which could go circular if the timer that triggers time accumulation were mis-scheduled too far in the future, which would cause time to stop. However, since the mult overflow would result in a smaller time value, we would effectively have the same problem there. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-6-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 49 +++++++++++++++++++++++++++++++++-------------- 1 file changed, 35 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index acf049144cf6..657414cf2e46 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -126,9 +126,9 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) const char *name = tk->tkr.clock->name; if (offset > max_cycles) { - printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow\n", + printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", offset, name, max_cycles); - printk_deferred(" timekeeping: Your kernel is sick, but tries to cope\n"); + printk_deferred(" timekeeping: Your kernel is sick, but tries to cope by capping time updates\n"); } else { if (offset > (max_cycles >> 1)) { printk_deferred("INFO: timekeeping: Cycle offset (%lld) is larger than the the '%s' clock's 50%% safety margin (%lld)\n", @@ -137,10 +137,39 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) } } } + +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t cycle_now, delta; + + /* read clocksource */ + cycle_now = tkr->read(tkr->clock); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + /* Cap delta value to the max_cycles values to avoid mult overflows */ + if (unlikely(delta > tkr->clock->max_cycles)) + delta = tkr->clock->max_cycles; + + return delta; +} #else static inline void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { } +static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) +{ + cycle_t cycle_now, delta; + + /* read clocksource */ + cycle_now = tkr->read(tkr->clock); + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + + return delta; +} #endif /** @@ -218,14 +247,10 @@ static inline u32 arch_gettimeoffset(void) { return 0; } static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) { - cycle_t cycle_now, delta; + cycle_t delta; s64 nsec; - /* read clocksource: */ - cycle_now = tkr->read(tkr->clock); - - /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + delta = timekeeping_get_delta(tkr); nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; @@ 
-237,14 +262,10 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { struct clocksource *clock = tk->tkr.clock; - cycle_t cycle_now, delta; + cycle_t delta; s64 nsec; - /* read clocksource: */ - cycle_now = tk->tkr.read(clock); - - /* calculate the delta since the last update_wall_time: */ - delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); + delta = timekeeping_get_delta(&tk->tkr); /* convert delta to nanoseconds. */ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); -- cgit v1.3-8-gc7d7 From 057b87e3161d1194a095718f9918c01b2c389e74 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:34 -0700 Subject: timekeeping: Try to catch clocksource delta underflows In the case where there is a broken clocksource where there are multiple actual clocks that aren't perfectly aligned, we may see small "negative" deltas when we subtract 'now' from 'cycle_last'. The values are actually negative with respect to the clocksource mask value, not necessarily negative if cast to a s64, but we can check by checking the delta to see if it is a small (relative to the mask) negative value (again negative relative to the mask). If so, we assume we jumped backwards somehow and instead use zero for our delta. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-7-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 657414cf2e46..187149be83ea 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -148,6 +148,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) /* calculate the delta since the last update_wall_time */ delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + /* + * Try to catch underflows by checking if we are seeing small + * mask-relative negative values. + */ + if (unlikely((~delta & tkr->mask) < (tkr->mask >> 3))) + delta = 0; + /* Cap delta value to the max_cycles values to avoid mult overflows */ if (unlikely(delta > tkr->clock->max_cycles)) delta = tkr->clock->max_cycles; -- cgit v1.3-8-gc7d7 From 4ca22c2648f9c1cec0b242f58d7302136f5a4cbb Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:35 -0700 Subject: timekeeping: Add warnings when overflows or underflows are observed It was suggested that the underflow/overflow protection should probably throw some sort of warning out, rather than just silently fixing the issue. So this patch adds some warnings here. The flag variables used are not protected by locks, but since we can't print from the reading functions, just being able to say we saw an issue in the update interval is useful enough, and can be slightly racy without real consequence. The big complication is that we're only under a read seqlock, so the data could shift under us during our calculation to see if there was a problem. This patch avoids this issue by nesting another seqlock which allows us to snapshot the just required values atomically. So we shouldn't see false positives. I also added some basic rate-limiting here, since on one build machine w/ skewed TSCs it was fairly noisy at bootup. 
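[ Editorial sketch, not part of the patch: the pattern described above (racy flag variables set in the read path, then reported and cleared with simple rate limiting in the update path) looks roughly like this in plain C. The names and the time(NULL)-based limiter are stand-ins for the jiffies-based logic in the patch. ]

#include <stdio.h>
#include <time.h>

#define WARN_INTERVAL_SEC 300		/* mirrors the 5-minute rate limit */

static int underflow_seen;		/* set locklessly in the hot read path */
static time_t last_warning;		/* touched only in the update path */

/* Hot path: just note that something looked wrong; no printing allowed here. */
static void note_underflow(void)
{
	underflow_seen = 1;
}

/* Periodic update path: report at most once per interval, then clear the flag. */
static void check_and_warn(void)
{
	time_t now = time(NULL);

	if (underflow_seen && now - last_warning > WARN_INTERVAL_SEC) {
		fprintf(stderr, "WARNING: clocksource underflow observed\n");
		last_warning = now;
	}
	underflow_seen = 0;
}

int main(void)
{
	note_underflow();
	check_and_warn();	/* prints once */
	note_underflow();
	check_and_warn();	/* rate-limited: stays silent */
	return 0;
}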
Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-8-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 64 +++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 57 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 187149be83ea..892f6cbf1e67 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -119,6 +119,20 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) } #ifdef CONFIG_DEBUG_TIMEKEEPING +#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */ +/* + * These simple flag variables are managed + * without locks, which is racy, but ok since + * we don't really care about being super + * precise about how many events were seen, + * just that a problem was observed. + */ +static int timekeeping_underflow_seen; +static int timekeeping_overflow_seen; + +/* last_warning is only modified under the timekeeping lock */ +static long timekeeping_last_warning; + static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { @@ -136,28 +150,64 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) printk_deferred(" timekeeping: Your kernel is still fine, but is feeling a bit nervous\n"); } } + + if (timekeeping_underflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_underflow_seen = 0; + } + + if (timekeeping_overflow_seen) { + if (jiffies - timekeeping_last_warning > WARNING_FREQ) { + printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name); + printk_deferred(" Please report this, consider using a different clocksource, if possible.\n"); + printk_deferred(" Your kernel is probably still fine.\n"); + timekeeping_last_warning = jiffies; + } + timekeeping_overflow_seen = 0; + } } static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr) { - cycle_t cycle_now, delta; + cycle_t now, last, mask, max, delta; + unsigned int seq; - /* read clocksource */ - cycle_now = tkr->read(tkr->clock); + /* + * Since we're called holding a seqlock, the data may shift + * under us while we're doing the calculation. This can cause + * false positives, since we'd note a problem but throw the + * results away. So nest another seqlock here to atomically + * grab the points we are checking with. + */ + do { + seq = read_seqcount_begin(&tk_core.seq); + now = tkr->read(tkr->clock); + last = tkr->cycle_last; + mask = tkr->mask; + max = tkr->clock->max_cycles; + } while (read_seqcount_retry(&tk_core.seq, seq)); - /* calculate the delta since the last update_wall_time */ - delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask); + delta = clocksource_delta(now, last, mask); /* * Try to catch underflows by checking if we are seeing small * mask-relative negative values. 
*/ - if (unlikely((~delta & tkr->mask) < (tkr->mask >> 3))) + if (unlikely((~delta & mask) < (mask >> 3))) { + timekeeping_underflow_seen = 1; delta = 0; + } /* Cap delta value to the max_cycles values to avoid mult overflows */ - if (unlikely(delta > tkr->clock->max_cycles)) + if (unlikely(delta > max)) { + timekeeping_overflow_seen = 1; delta = tkr->clock->max_cycles; + } return delta; } -- cgit v1.3-8-gc7d7 From 0b046b217ad4c64fbbeaaac24d0648cb1fa49ad8 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:36 -0700 Subject: clocksource: Improve clocksource watchdog reporting The clocksource watchdog reporting has been less helpful then desired, as it just printed the delta between the two clocksources. This prevents any useful analysis of why the skew occurred. Thus this patch tries to improve the output when we mark a clocksource as unstable, printing out the cycle last and now values for both the current clocksource and the watchdog clocksource. This will allow us to see if the result was due to a false positive caused by a problematic watchdog. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-9-git-send-email-john.stultz@linaro.org [ Minor cleanups of kernel messages. ] Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index fc2a9de43ca1..c4cc04bec698 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -142,13 +142,6 @@ static void __clocksource_unstable(struct clocksource *cs) schedule_work(&watchdog_work); } -static void clocksource_unstable(struct clocksource *cs, int64_t delta) -{ - printk(KERN_WARNING "Clocksource %s unstable (delta = %Ld ns)\n", - cs->name, delta); - __clocksource_unstable(cs); -} - /** * clocksource_mark_unstable - mark clocksource unstable via watchdog * @cs: clocksource to be marked unstable @@ -174,7 +167,7 @@ void clocksource_mark_unstable(struct clocksource *cs) static void clocksource_watchdog(unsigned long data) { struct clocksource *cs; - cycle_t csnow, wdnow, delta; + cycle_t csnow, wdnow, cslast, wdlast, delta; int64_t wd_nsec, cs_nsec; int next_cpu, reset_pending; @@ -213,6 +206,8 @@ static void clocksource_watchdog(unsigned long data) delta = clocksource_delta(csnow, cs->cs_last, cs->mask); cs_nsec = clocksource_cyc2ns(delta, cs->mult, cs->shift); + wdlast = cs->wd_last; /* save these in case we print them */ + cslast = cs->cs_last; cs->cs_last = csnow; cs->wd_last = wdnow; @@ -221,7 +216,12 @@ static void clocksource_watchdog(unsigned long data) /* Check the deviation from the watchdog clocksource. 
*/ if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) { - clocksource_unstable(cs, cs_nsec - wd_nsec); + pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name); + pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", + watchdog->name, wdnow, wdlast, watchdog->mask); + pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", + cs->name, csnow, cslast, cs->mask); + __clocksource_unstable(cs); continue; } -- cgit v1.3-8-gc7d7 From f8935983f110505daa38e8d36ee406807f83a069 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:37 -0700 Subject: clocksource: Mostly kill clocksource_register() A long running project has been to clean up remaining uses of clocksource_register(), replacing it with the simpler clocksource_register_khz/hz() functions. However, there are a few cases where we need to self-define our mult/shift values, so switch the function to a more obviously internal __clocksource_register() name, and consolidate much of the internal logic so we don't have duplication. Signed-off-by: John Stultz Cc: Dave Jones Cc: David S. Miller Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-10-git-send-email-john.stultz@linaro.org [ Minor cleanups. ] Signed-off-by: Ingo Molnar --- arch/s390/kernel/time.c | 2 +- arch/sparc/kernel/time_32.c | 2 +- include/linux/clocksource.h | 10 +++++- kernel/time/clocksource.c | 81 +++++++++++++++++++-------------------------- kernel/time/jiffies.c | 4 +-- 5 files changed, 47 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 20660dddb2d6..6c273cd815bb 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -283,7 +283,7 @@ void __init time_init(void) if (register_external_irq(EXT_IRQ_TIMING_ALERT, timing_alert_interrupt)) panic("Couldn't request external interrupt 0x1406"); - if (clocksource_register(&clocksource_tod) != 0) + if (__clocksource_register(&clocksource_tod) != 0) panic("Could not register TOD clock source"); /* Enable TOD clock interrupts on the boot cpu. */ diff --git a/arch/sparc/kernel/time_32.c b/arch/sparc/kernel/time_32.c index 2f80d23a0a44..a31c0c822beb 100644 --- a/arch/sparc/kernel/time_32.c +++ b/arch/sparc/kernel/time_32.c @@ -191,7 +191,7 @@ static __init int setup_timer_cs(void) timer_cs.mult = clocksource_hz2mult(sparc_config.clock_rate, timer_cs.shift); - return clocksource_register(&timer_cs); + return __clocksource_register(&timer_cs); } #ifdef CONFIG_SMP diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 16d048cadebb..bd98eaa4d005 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -179,7 +179,6 @@ static inline s64 clocksource_cyc2ns(cycle_t cycles, u32 mult, u32 shift) } -extern int clocksource_register(struct clocksource*); extern int clocksource_unregister(struct clocksource*); extern void clocksource_touch_watchdog(void); extern struct clocksource* clocksource_get_next(void); @@ -203,6 +202,15 @@ __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq); extern void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq); +/* + * Don't call this unless you are a default clocksource + * (AKA: jiffies) and absolutely have to. 
+ */ +static inline int __clocksource_register(struct clocksource *cs) +{ + return __clocksource_register_scale(cs, 1, 0); +} + static inline int clocksource_register_hz(struct clocksource *cs, u32 hz) { return __clocksource_register_scale(cs, 1, hz); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c4cc04bec698..5cdf17eb4fa6 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -656,38 +656,52 @@ static void clocksource_enqueue(struct clocksource *cs) void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; + /* - * Calc the maximum number of seconds which we can run before - * wrapping around. For clocksources which have a mask > 32bit - * we need to limit the max sleep time to have a good - * conversion precision. 10 minutes is still a reasonable - * amount. That results in a shift value of 24 for a - * clocksource with mask >= 40bit and f >= 4GHz. That maps to - * ~ 0.06ppm granularity for NTP. + * Default clocksources are *special* and self-define their mult/shift. + * But, you're not special, so you should specify a freq value. */ - sec = cs->mask; - do_div(sec, freq); - do_div(sec, scale); - if (!sec) - sec = 1; - else if (sec > 600 && cs->mask > UINT_MAX) - sec = 600; - - clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, - NSEC_PER_SEC / scale, sec * scale); - + if (freq) { + /* + * Calc the maximum number of seconds which we can run before + * wrapping around. For clocksources which have a mask > 32-bit + * we need to limit the max sleep time to have a good + * conversion precision. 10 minutes is still a reasonable + * amount. That results in a shift value of 24 for a + * clocksource with mask >= 40-bit and f >= 4GHz. That maps to + * ~ 0.06ppm granularity for NTP. + */ + sec = cs->mask; + do_div(sec, freq); + do_div(sec, scale); + if (!sec) + sec = 1; + else if (sec > 600 && cs->mask > UINT_MAX) + sec = 600; + + clocks_calc_mult_shift(&cs->mult, &cs->shift, freq, + NSEC_PER_SEC / scale, sec * scale); + } /* * Ensure clocksources that have large 'mult' values don't overflow * when adjusted. */ cs->maxadj = clocksource_max_adjustment(cs); - while ((cs->mult + cs->maxadj < cs->mult) - || (cs->mult - cs->maxadj > cs->mult)) { + while (freq && ((cs->mult + cs->maxadj < cs->mult) + || (cs->mult - cs->maxadj > cs->mult))) { cs->mult >>= 1; cs->shift--; cs->maxadj = clocksource_max_adjustment(cs); } + /* + * Only warn for *special* clocksources that self-define + * their mult/shift values and don't specify a freq. + */ + WARN_ONCE(cs->mult + cs->maxadj < cs->mult, + "timekeeping: Clocksource %s might overflow on 11%% adjustment\n", + cs->name); + clocksource_update_max_deferment(cs); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); @@ -719,33 +733,6 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) } EXPORT_SYMBOL_GPL(__clocksource_register_scale); - -/** - * clocksource_register - Used to install new clocksources - * @cs: clocksource to be registered - * - * Returns -EBUSY if registration fails, zero otherwise. 
- */ -int clocksource_register(struct clocksource *cs) -{ - /* calculate max adjustment for given mult/shift */ - cs->maxadj = clocksource_max_adjustment(cs); - WARN_ONCE(cs->mult + cs->maxadj < cs->mult, - "Clocksource %s might overflow on 11%% adjustment\n", - cs->name); - - /* Update max idle time permitted for this clocksource */ - clocksource_update_max_deferment(cs); - - mutex_lock(&clocksource_mutex); - clocksource_enqueue(cs); - clocksource_enqueue_watchdog(cs); - clocksource_select(); - mutex_unlock(&clocksource_mutex); - return 0; -} -EXPORT_SYMBOL(clocksource_register); - static void __clocksource_change_rating(struct clocksource *cs, int rating) { list_del(&cs->list); diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 7e413902aa6a..c4bb518725b5 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -95,7 +95,7 @@ EXPORT_SYMBOL(jiffies); static int __init init_jiffies_clocksource(void) { - return clocksource_register(&clocksource_jiffies); + return __clocksource_register(&clocksource_jiffies); } core_initcall(init_jiffies_clocksource); @@ -131,6 +131,6 @@ int register_refined_jiffies(long cycles_per_second) refined_jiffies.mult = ((u32)nsec_per_tick) << JIFFIES_SHIFT; - clocksource_register(&refined_jiffies); + __clocksource_register(&refined_jiffies); return 0; } -- cgit v1.3-8-gc7d7 From 8cc8c525ad4e7b581cacf84119e1a28dcb4044db Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:39 -0700 Subject: clocksource: Add some debug info about clocksources being registered Print the mask, max_cycles, and max_idle_ns values for clocksources being registered. Signed-off-by: John Stultz Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-12-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 5cdf17eb4fa6..1977ebabd922 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -703,6 +703,9 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) cs->name); clocksource_update_max_deferment(cs); + + pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", + cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); -- cgit v1.3-8-gc7d7 From fba9e07208c0f9d92d9f73761c99c8612039da44 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Mar 2015 21:16:40 -0700 Subject: clocksource: Rename __clocksource_updatefreq_*() to __clocksource_update_freq_*() Ingo requested this function be renamed to improve readability, so I've renamed __clocksource_updatefreq_scale() as well as the __clocksource_updatefreq_hz/khz() functions to avoid squishedtogethernames. This touches some of the sh clocksources, which I've not tested. The arch/arm/plat-omap change is just a comment change for consistency. 
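For illustration only, a minimal sketch of what a clocksource driver's ->enable() callback looks like after the rename; example_hw_start() and example_rate_hz are placeholders, the real call sites are the em_sti/sh_cmt/sh_tmu hunks below:

	#include <linux/clocksource.h>

	/* hypothetical driver callback, shown only to demonstrate the new name */
	static int example_cs_enable(struct clocksource *cs)
	{
		int ret = example_hw_start();	/* placeholder hardware start */

		if (!ret)
			/* recompute mult/shift for the measured rate; only the
			 * helper name changes with this patch */
			__clocksource_update_freq_hz(cs, example_rate_hz);

		return ret;
	}
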
Signed-off-by: John Stultz Cc: Daniel Lezcano Cc: Dave Jones Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Stephen Boyd Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1426133800-29329-13-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- arch/arm/plat-omap/counter_32k.c | 2 +- drivers/clocksource/em_sti.c | 2 +- drivers/clocksource/sh_cmt.c | 2 +- drivers/clocksource/sh_tmu.c | 2 +- include/linux/clocksource.h | 10 +++++----- kernel/time/clocksource.c | 11 ++++++----- 6 files changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/arm/plat-omap/counter_32k.c b/arch/arm/plat-omap/counter_32k.c index 61b4d705c267..43cf74561cfd 100644 --- a/arch/arm/plat-omap/counter_32k.c +++ b/arch/arm/plat-omap/counter_32k.c @@ -103,7 +103,7 @@ int __init omap_init_clocksource_32k(void __iomem *vbase) /* * 120000 rough estimate from the calculations in - * __clocksource_updatefreq_scale. + * __clocksource_update_freq_scale. */ clocks_calc_mult_shift(&persistent_mult, &persistent_shift, 32768, NSEC_PER_SEC, 120000); diff --git a/drivers/clocksource/em_sti.c b/drivers/clocksource/em_sti.c index d0a7bd66b8b9..dc3c6ee04aaa 100644 --- a/drivers/clocksource/em_sti.c +++ b/drivers/clocksource/em_sti.c @@ -210,7 +210,7 @@ static int em_sti_clocksource_enable(struct clocksource *cs) ret = em_sti_start(p, USER_CLOCKSOURCE); if (!ret) - __clocksource_updatefreq_hz(cs, p->rate); + __clocksource_update_freq_hz(cs, p->rate); return ret; } diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 2bd13b53b727..b8ff3c64cc45 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -641,7 +641,7 @@ static int sh_cmt_clocksource_enable(struct clocksource *cs) ret = sh_cmt_start(ch, FLAG_CLOCKSOURCE); if (!ret) { - __clocksource_updatefreq_hz(cs, ch->rate); + __clocksource_update_freq_hz(cs, ch->rate); ch->cs_enabled = true; } return ret; diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index f150ca82bfaf..b6b8fa3cd211 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -272,7 +272,7 @@ static int sh_tmu_clocksource_enable(struct clocksource *cs) ret = sh_tmu_enable(ch); if (!ret) { - __clocksource_updatefreq_hz(cs, ch->rate); + __clocksource_update_freq_hz(cs, ch->rate); ch->cs_enabled = true; } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index bd98eaa4d005..135509821c39 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -200,7 +200,7 @@ clocks_calc_mult_shift(u32 *mult, u32 *shift, u32 from, u32 to, u32 minsec); extern int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq); extern void -__clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq); +__clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq); /* * Don't call this unless you are a default clocksource @@ -221,14 +221,14 @@ static inline int clocksource_register_khz(struct clocksource *cs, u32 khz) return __clocksource_register_scale(cs, 1000, khz); } -static inline void __clocksource_updatefreq_hz(struct clocksource *cs, u32 hz) +static inline void __clocksource_update_freq_hz(struct clocksource *cs, u32 hz) { - __clocksource_updatefreq_scale(cs, 1, hz); + __clocksource_update_freq_scale(cs, 1, hz); } -static inline void __clocksource_updatefreq_khz(struct clocksource *cs, u32 khz) +static inline void __clocksource_update_freq_khz(struct clocksource *cs, u32 khz) { - 
__clocksource_updatefreq_scale(cs, 1000, khz); + __clocksource_update_freq_scale(cs, 1000, khz); } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1977ebabd922..c3be3c71bbad 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -643,7 +643,7 @@ static void clocksource_enqueue(struct clocksource *cs) } /** - * __clocksource_updatefreq_scale - Used update clocksource with new freq + * __clocksource_update_freq_scale - Used update clocksource with new freq * @cs: clocksource to be registered * @scale: Scale factor multiplied against freq to get clocksource hz * @freq: clocksource frequency (cycles per second) divided by scale @@ -651,9 +651,10 @@ static void clocksource_enqueue(struct clocksource *cs) * This should only be called from the clocksource->enable() method. * * This *SHOULD NOT* be called directly! Please use the - * clocksource_updatefreq_hz() or clocksource_updatefreq_khz helper functions. + * __clocksource_update_freq_hz() or __clocksource_update_freq_khz() helper + * functions. */ -void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) +void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq) { u64 sec; @@ -707,7 +708,7 @@ void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq) pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n", cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns); } -EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale); +EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale); /** * __clocksource_register_scale - Used to install new clocksources @@ -724,7 +725,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) { /* Initialize mult/shift and max_idle_ns */ - __clocksource_updatefreq_scale(cs, scale, freq); + __clocksource_update_freq_scale(cs, scale, freq); /* Add clocksource to the clocksource list */ mutex_lock(&clocksource_mutex); -- cgit v1.3-8-gc7d7 From d415a7f1c1a8406b22d95b943c66a5b73a37bc19 Mon Sep 17 00:00:00 2001 From: Leon Yu Date: Thu, 26 Feb 2015 20:43:33 +0800 Subject: perf: Fix context leak in put_event() Commit: a83fe28e2e45 ("perf: Fix put_event() ctx lock") changed the locking logic in put_event() by replacing mutex_lock_nested() with perf_event_ctx_lock_nested(), but didn't fix the subsequent mutex_unlock() with a correct counterpart, perf_event_ctx_unlock(). Contexts are thus leaked as a result of incremented refcount in perf_event_ctx_lock_nested(). 
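Stated as a pattern, the pairing the fix below restores is (sketch only, error paths elided):

	struct perf_event_context *ctx;

	ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
	/* ... detach the event from its context ... */
	perf_event_ctx_unlock(event, ctx);	/* also drops the reference taken
						 * by the lock helper, which a bare
						 * mutex_unlock(&ctx->mutex) does not */
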
Signed-off-by: Leon Yu Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Peter Zijlstra Fixes: a83fe28e2e45 ("perf: Fix put_event() ctx lock") Link: http://lkml.kernel.org/r/1424954613-5034-1-git-send-email-chianglungyu@gmail.com Signed-off-by: Ingo Molnar --- kernel/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index f04daabfd1cf..453ef61311d4 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3591,7 +3591,7 @@ static void put_event(struct perf_event *event) ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); WARN_ON_ONCE(ctx->parent_ctx); perf_remove_from_context(event, true); - mutex_unlock(&ctx->mutex); + perf_event_ctx_unlock(event, ctx); _free_event(event); } -- cgit v1.3-8-gc7d7 From 08b55e2a9208e4841a17c9d9c2c454986392977d Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Wed, 11 Mar 2015 15:43:43 +0000 Subject: genirq: Add irqchip_set_wake_parent This proves to be useful with stacked domains, when the current domain doesn't implement wake-up, but expect the parent to do so. Acked-by: Tony Lindgren Signed-off-by: Marc Zyngier Link: https://lkml.kernel.org/r/1426088629-15377-2-git-send-email-marc.zyngier@arm.com Signed-off-by: Jason Cooper --- include/linux/irq.h | 1 + kernel/irq/chip.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/irq.h b/include/linux/irq.h index d09ec7a1243e..3057c48e4933 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -460,6 +460,7 @@ extern void irq_chip_eoi_parent(struct irq_data *data); extern int irq_chip_set_affinity_parent(struct irq_data *data, const struct cpumask *dest, bool force); +extern int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on); #endif /* Handling of unhandled and spurious interrupts: */ diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 6f1c7a566b95..eb9a4ea394ab 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -948,6 +948,22 @@ int irq_chip_retrigger_hierarchy(struct irq_data *data) return -ENOSYS; } + +/** + * irq_chip_set_wake_parent - Set/reset wake-up on the parent interrupt + * @data: Pointer to interrupt specific data + * @on: Whether to set or reset the wake-up capability of this irq + * + * Conditional, as the underlying parent chip might not implement it. + */ +int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) +{ + data = data->parent_data; + if (data->chip->irq_set_wake) + return data->chip->irq_set_wake(data, on); + + return -ENOSYS; +} #endif /** -- cgit v1.3-8-gc7d7 From 03e69b508b6f7c51743055c9f61d1dfeadf4b635 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:16 +0100 Subject: ebpf: add prandom helper for packet sampling This work is similar to commit 4cd3675ebf74 ("filter: added BPF random opcode") and adds a possibility for packet sampling in eBPF. Currently, this is only possible in classic BPF and useful to combine sampling with f.e. packet sockets, possible also with tc. Example function proto-type looks like: u32 (*prandom_u32)(void) = (void *)BPF_FUNC_get_prandom_u32; Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/bpf.h | 2 ++ include/uapi/linux/bpf.h | 1 + kernel/bpf/core.c | 2 ++ kernel/bpf/helpers.c | 12 ++++++++++++ net/core/filter.c | 2 ++ 5 files changed, 19 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 80f2e0fc3d02..50bf95e29a96 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -154,4 +154,6 @@ extern const struct bpf_func_proto bpf_map_lookup_elem_proto; extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; +extern const struct bpf_func_proto bpf_get_prandom_u32_proto; + #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 3fa1af8a58d7..1c2ca2b477c8 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 50603aec766a..c1dbbb5d289b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -661,6 +661,8 @@ const struct bpf_func_proto bpf_map_lookup_elem_proto __weak; const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; +const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; + /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. */ diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index a3c7701a8b5e..95eb59a045ea 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -11,6 +11,7 @@ */ #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -87,3 +88,14 @@ const struct bpf_func_proto bpf_map_delete_elem_proto = { .arg1_type = ARG_CONST_MAP_PTR, .arg2_type = ARG_PTR_TO_MAP_KEY, }; + +static u64 bpf_get_prandom_u32(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return prandom_u32(); +} + +const struct bpf_func_proto bpf_get_prandom_u32_proto = { + .func = bpf_get_prandom_u32, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 7a4eb7030dba..4344db39af2e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1139,6 +1139,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_update_elem_proto; case BPF_FUNC_map_delete_elem: return &bpf_map_delete_elem_proto; + case BPF_FUNC_get_prandom_u32: + return &bpf_get_prandom_u32_proto; default: return NULL; } -- cgit v1.3-8-gc7d7 From c04167ce2ca0ecaeaafef006cb0d65cf01b68e42 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 14 Mar 2015 02:27:17 +0100 Subject: ebpf: add helper for obtaining current processor id This patch adds the possibility to obtain raw_smp_processor_id() in eBPF. Currently, this is only possible in classic BPF where commit da2033c28226 ("filter: add SKF_AD_RXHASH and SKF_AD_CPU") has added facilities for this. Perhaps most importantly, this would also allow us to track per CPU statistics with eBPF maps, or to implement a poor-man's per CPU data structure through eBPF maps. 
Example function proto-type looks like: u32 (*smp_processor_id)(void) = (void *)BPF_FUNC_get_smp_processor_id; Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/core.c | 1 + kernel/bpf/helpers.c | 12 ++++++++++++ net/core/filter.c | 2 ++ 5 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 50bf95e29a96..30bfd331882a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -155,5 +155,6 @@ extern const struct bpf_func_proto bpf_map_update_elem_proto; extern const struct bpf_func_proto bpf_map_delete_elem_proto; extern const struct bpf_func_proto bpf_get_prandom_u32_proto; +extern const struct bpf_func_proto bpf_get_smp_processor_id_proto; #endif /* _LINUX_BPF_H */ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1c2ca2b477c8..de1f63668daf 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ + BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1dbbb5d289b..4139a0f8b558 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -662,6 +662,7 @@ const struct bpf_func_proto bpf_map_update_elem_proto __weak; const struct bpf_func_proto bpf_map_delete_elem_proto __weak; const struct bpf_func_proto bpf_get_prandom_u32_proto __weak; +const struct bpf_func_proto bpf_get_smp_processor_id_proto __weak; /* To execute LD_ABS/LD_IND instructions __bpf_prog_run() may call * skb_copy_bits(), so provide a weak definition of it for NET-less config. 
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 95eb59a045ea..bd7f5988ed9c 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -12,6 +12,7 @@ #include #include #include +#include /* If kernel subsystem is allowing eBPF programs to call this function, * inside its own verifier_ops->get_func_proto() callback it should return @@ -99,3 +100,14 @@ const struct bpf_func_proto bpf_get_prandom_u32_proto = { .gpl_only = false, .ret_type = RET_INTEGER, }; + +static u64 bpf_get_smp_processor_id(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + return raw_smp_processor_id(); +} + +const struct bpf_func_proto bpf_get_smp_processor_id_proto = { + .func = bpf_get_smp_processor_id, + .gpl_only = false, + .ret_type = RET_INTEGER, +}; diff --git a/net/core/filter.c b/net/core/filter.c index 4344db39af2e..33310eee6134 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1141,6 +1141,8 @@ sk_filter_func_proto(enum bpf_func_id func_id) return &bpf_map_delete_elem_proto; case BPF_FUNC_get_prandom_u32: return &bpf_get_prandom_u32_proto; + case BPF_FUNC_get_smp_processor_id: + return &bpf_get_smp_processor_id_proto; default: return NULL; } -- cgit v1.3-8-gc7d7 From 9bac3d6d548e5cc925570b263f35b70a00a00ffd Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Fri, 13 Mar 2015 11:57:42 -0700 Subject: bpf: allow extended BPF programs access skb fields introduce user accessible mirror of in-kernel 'struct sk_buff': struct __sk_buff { __u32 len; __u32 pkt_type; __u32 mark; __u32 queue_mapping; }; bpf programs can do: int bpf_prog(struct __sk_buff *skb) { __u32 var = skb->pkt_type; which will be compiled to bpf assembler as: dst_reg = *(u32 *)(src_reg + 4) // 4 == offsetof(struct __sk_buff, pkt_type) bpf verifier will check validity of access and will convert it to: dst_reg = *(u8 *)(src_reg + offsetof(struct sk_buff, __pkt_type_offset)) dst_reg &= 7 since skb->pkt_type is a bitfield. Signed-off-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/linux/bpf.h | 5 +- include/uapi/linux/bpf.h | 10 ++++ kernel/bpf/syscall.c | 2 +- kernel/bpf/verifier.c | 152 ++++++++++++++++++++++++++++++++++++++++++----- net/core/filter.c | 100 +++++++++++++++++++++++++------ 5 files changed, 234 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 30bfd331882a..280a315de8d6 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -103,6 +103,9 @@ struct bpf_verifier_ops { * with 'type' (read or write) is allowed */ bool (*is_valid_access)(int off, int size, enum bpf_access_type type); + + u32 (*convert_ctx_access)(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn); }; struct bpf_prog_type_list { @@ -133,7 +136,7 @@ struct bpf_map *bpf_map_get(struct fd f); void bpf_map_put(struct bpf_map *map); /* verify correctness of eBPF program */ -int bpf_check(struct bpf_prog *fp, union bpf_attr *attr); +int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) { diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index de1f63668daf..929545a27546 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -170,4 +170,14 @@ enum bpf_func_id { __BPF_FUNC_MAX_ID, }; +/* user accessible mirror of in-kernel sk_buff. 
+ * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 669719ccc9ee..ea75c654af1b 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -519,7 +519,7 @@ static int bpf_prog_load(union bpf_attr *attr) goto free_prog; /* run eBPF verifier */ - err = bpf_check(prog, attr); + err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e6b522496250..c22ebd36fa4b 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1620,11 +1620,10 @@ static int do_check(struct verifier_env *env) return err; } else if (class == BPF_LDX) { - if (BPF_MODE(insn->code) != BPF_MEM || - insn->imm != 0) { - verbose("BPF_LDX uses reserved fields\n"); - return -EINVAL; - } + enum bpf_reg_type src_reg_type; + + /* check for reserved fields is already done */ + /* check src operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) @@ -1643,6 +1642,29 @@ static int do_check(struct verifier_env *env) if (err) return err; + src_reg_type = regs[insn->src_reg].type; + + if (insn->imm == 0 && BPF_SIZE(insn->code) == BPF_W) { + /* saw a valid insn + * dst_reg = *(u32 *)(src_reg + off) + * use reserved 'imm' field to mark this insn + */ + insn->imm = src_reg_type; + + } else if (src_reg_type != insn->imm && + (src_reg_type == PTR_TO_CTX || + insn->imm == PTR_TO_CTX)) { + /* ABuser program is trying to use the same insn + * dst_reg = *(u32*) (src_reg + off) + * with different pointer types: + * src_reg == ctx in one branch and + * src_reg == stack|map in some other branch. + * Reject it. + */ + verbose("same insn cannot be used with different pointers\n"); + return -EINVAL; + } + } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); @@ -1790,6 +1812,13 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) int i, j; for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) == BPF_LDX && + (BPF_MODE(insn->code) != BPF_MEM || + insn->imm != 0)) { + verbose("BPF_LDX uses reserved fields\n"); + return -EINVAL; + } + if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) { struct bpf_map *map; struct fd f; @@ -1881,6 +1910,92 @@ static void convert_pseudo_ld_imm64(struct verifier_env *env) insn->src_reg = 0; } +static void adjust_branches(struct bpf_prog *prog, int pos, int delta) +{ + struct bpf_insn *insn = prog->insnsi; + int insn_cnt = prog->len; + int i; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (BPF_CLASS(insn->code) != BPF_JMP || + BPF_OP(insn->code) == BPF_CALL || + BPF_OP(insn->code) == BPF_EXIT) + continue; + + /* adjust offset of jmps if necessary */ + if (i < pos && i + insn->off + 1 > pos) + insn->off += delta; + else if (i > pos && i + insn->off + 1 < pos) + insn->off -= delta; + } +} + +/* convert load instructions that access fields of 'struct __sk_buff' + * into sequence of instructions that access fields of 'struct sk_buff' + */ +static int convert_ctx_accesses(struct verifier_env *env) +{ + struct bpf_insn *insn = env->prog->insnsi; + int insn_cnt = env->prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + u32 cnt; + int i; + + if (!env->prog->aux->ops->convert_ctx_access) + return 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_LDX | BPF_MEM | BPF_W)) + continue; + + if (insn->imm != PTR_TO_CTX) { + 
/* clear internal mark */ + insn->imm = 0; + continue; + } + + cnt = env->prog->aux->ops-> + convert_ctx_access(insn->dst_reg, insn->src_reg, + insn->off, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + if (cnt == 1) { + memcpy(insn, insn_buf, sizeof(*insn)); + continue; + } + + /* several new insns need to be inserted. Make room for them */ + insn_cnt += cnt - 1; + new_prog = bpf_prog_realloc(env->prog, + bpf_prog_size(insn_cnt), + GFP_USER); + if (!new_prog) + return -ENOMEM; + + new_prog->len = insn_cnt; + + memmove(new_prog->insnsi + i + cnt, new_prog->insns + i + 1, + sizeof(*insn) * (insn_cnt - i - cnt)); + + /* copy substitute insns in place of load instruction */ + memcpy(new_prog->insnsi + i, insn_buf, sizeof(*insn) * cnt); + + /* adjust branches in the whole program */ + adjust_branches(new_prog, i, cnt - 1); + + /* keep walking new program and skip insns we just inserted */ + env->prog = new_prog; + insn = new_prog->insnsi + i + cnt - 1; + i += cnt - 1; + } + + return 0; +} + static void free_states(struct verifier_env *env) { struct verifier_state_list *sl, *sln; @@ -1903,13 +2018,13 @@ static void free_states(struct verifier_env *env) kfree(env->explored_states); } -int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) +int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; - if (prog->len <= 0 || prog->len > BPF_MAXINSNS) + if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, @@ -1919,7 +2034,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (!env) return -ENOMEM; - env->prog = prog; + env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); @@ -1951,7 +2066,7 @@ int bpf_check(struct bpf_prog *prog, union bpf_attr *attr) if (ret < 0) goto skip_full_check; - env->explored_states = kcalloc(prog->len, + env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; @@ -1968,6 +2083,10 @@ skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); + if (ret == 0) + /* program is valid, convert *(u32*)(ctx + off) accesses */ + ret = convert_ctx_accesses(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ @@ -1983,18 +2102,18 @@ skip_full_check: if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ - prog->aux->used_maps = kmalloc_array(env->used_map_cnt, - sizeof(env->used_maps[0]), - GFP_KERNEL); + env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, + sizeof(env->used_maps[0]), + GFP_KERNEL); - if (!prog->aux->used_maps) { + if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } - memcpy(prog->aux->used_maps, env->used_maps, + memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); - prog->aux->used_map_cnt = env->used_map_cnt; + env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions @@ -2006,11 +2125,12 @@ free_log_buf: if (log_level) vfree(log_buf); free_env: - if (!prog->aux->used_maps) + if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. 
Otherwise free_bpf_prog_info() will release them. */ release_maps(env); + *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; diff --git a/net/core/filter.c b/net/core/filter.c index 33310eee6134..4e9dd0ad0d5b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -150,10 +150,43 @@ static u64 __get_random_u32(u64 ctx, u64 a, u64 x, u64 r4, u64 r5) return prandom_u32(); } +static u32 convert_skb_access(int skb_field, int dst_reg, int src_reg, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (skb_field) { + case SKF_AD_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg, + offsetof(struct sk_buff, mark)); + break; + + case SKF_AD_PKTTYPE: + *insn++ = BPF_LDX_MEM(BPF_B, dst_reg, src_reg, PKT_TYPE_OFFSET()); + *insn++ = BPF_ALU32_IMM(BPF_AND, dst_reg, PKT_TYPE_MAX); +#ifdef __BIG_ENDIAN_BITFIELD + *insn++ = BPF_ALU32_IMM(BPF_RSH, dst_reg, 5); +#endif + break; + + case SKF_AD_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + + *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, + offsetof(struct sk_buff, queue_mapping)); + break; + } + + return insn - insn_buf; +} + static bool convert_bpf_extensions(struct sock_filter *fp, struct bpf_insn **insnp) { struct bpf_insn *insn = *insnp; + u32 cnt; switch (fp->k) { case SKF_AD_OFF + SKF_AD_PROTOCOL: @@ -167,13 +200,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_PKTTYPE: - *insn++ = BPF_LDX_MEM(BPF_B, BPF_REG_A, BPF_REG_CTX, - PKT_TYPE_OFFSET()); - *insn = BPF_ALU32_IMM(BPF_AND, BPF_REG_A, PKT_TYPE_MAX); -#ifdef __BIG_ENDIAN_BITFIELD - insn++; - *insn = BPF_ALU32_IMM(BPF_RSH, BPF_REG_A, 5); -#endif + cnt = convert_skb_access(SKF_AD_PKTTYPE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_IFINDEX: @@ -197,10 +225,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_MARK: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); - - *insn = BPF_LDX_MEM(BPF_W, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, mark)); + cnt = convert_skb_access(SKF_AD_MARK, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_RXHASH: @@ -211,10 +237,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, break; case SKF_AD_OFF + SKF_AD_QUEUE: - BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); - - *insn = BPF_LDX_MEM(BPF_H, BPF_REG_A, BPF_REG_CTX, - offsetof(struct sk_buff, queue_mapping)); + cnt = convert_skb_access(SKF_AD_QUEUE, BPF_REG_A, BPF_REG_CTX, insn); + insn += cnt - 1; break; case SKF_AD_OFF + SKF_AD_VLAN_TAG: @@ -1151,13 +1175,55 @@ sk_filter_func_proto(enum bpf_func_id func_id) static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { - /* skb fields cannot be accessed yet */ - return false; + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* check bounds */ + if (off < 0 || off >= sizeof(struct __sk_buff)) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + /* all __sk_buff fields are __u32 */ + if (size != 4) + return false; + + return true; +} + +static u32 sk_filter_convert_ctx_access(int dst_reg, int src_reg, int ctx_off, + struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + + switch (ctx_off) { + case offsetof(struct __sk_buff, len): + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, 
src_reg, + offsetof(struct sk_buff, len)); + break; + + case offsetof(struct __sk_buff, mark): + return convert_skb_access(SKF_AD_MARK, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, pkt_type): + return convert_skb_access(SKF_AD_PKTTYPE, dst_reg, src_reg, insn); + + case offsetof(struct __sk_buff, queue_mapping): + return convert_skb_access(SKF_AD_QUEUE, dst_reg, src_reg, insn); + } + + return insn - insn_buf; } static const struct bpf_verifier_ops sk_filter_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, }; static struct bpf_prog_type_list sk_filter_type __read_mostly = { -- cgit v1.3-8-gc7d7 From 8cb2c2dc472775479a1a7e78180955f6f1cb0b0a Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Thu, 12 Mar 2015 12:55:13 +0100 Subject: livepatch: Fix subtle race with coming and going modules There is a notifier that handles live patches for coming and going modules. It takes klp_mutex lock to avoid races with coming and going patches but it does not keep the lock all the time. Therefore the following races are possible: 1. The notifier is called sometime in STATE_MODULE_COMING. The module is visible by find_module() in this state all the time. It means that new patch can be registered and enabled even before the notifier is called. It might create wrong order of stacked patches, see below for an example. 2. New patch could still see the module in the GOING state even after the notifier has been called. It will try to initialize the related object structures but the module could disappear at any time. There will stay mess in the structures. It might even cause an invalid memory access. This patch solves the problem by adding a boolean variable into struct module. The value is true after the coming and before the going handler is called. New patches need to be applied when the value is true and they need to ignore the module when the value is false. Note that we need to know state of all modules on the system. The races are related to new patches. Therefore we do not know what modules will get patched. Also note that we could not simply ignore going modules. The code from the module could be called even in the GOING state until mod->exit() finishes. If we start supporting patches with semantic changes between function calls, we need to apply new patches to any still usable code. See below for an example. Finally note that the patch solves only the situation when a new patch is registered. There are no such problems when the patch is being removed. It does not matter who disable the patch first, whether the normal disable_patch() or the module notifier. There is nothing to do once the patch is disabled. 
Alternative solutions: ====================== + reject new patches when a patched module is coming or going; this is ugly + wait with adding new patch until the module leaves the COMING and GOING states; this might be dangerous and complicated; we would need to release kgr_lock in the middle of the patch registration to avoid a deadlock with the coming and going handlers; also we might need a waitqueue for each module which seems to be even bigger overhead than the boolean + stop modules from entering COMING and GOING states; wait until modules leave these states when they are already there; looks complicated; we would need to ignore the module that asked to stop the others to avoid a deadlock; also it is unclear what to do when two modules asked to stop others and both are in COMING state (situation when two new patches are applied) + always register/enable new patches and fix up the potential mess (registered patches order) in klp_module_init(); this is nasty and prone to regressions in the future development + add another MODULE_STATE where the kallsyms are visible but the module is not used yet; this looks too complex; the module states are checked on "many" locations Example of patch stacking breakage: =================================== The notifier could _not_ _simply_ ignore already initialized module objects. For example, let's have three patches (P1, P2, P3) for functions a() and b() where a() is from vmcore and b() is from a module M. Something like: a() b() P1 a1() b1() P2 a2() b2() P3 a3() b3(3) If you load the module M after all patches are registered and enabled. The ftrace ops for function a() and b() has listed the functions in this order: ops_a->func_stack -> list(a3,a2,a1) ops_b->func_stack -> list(b3,b2,b1) , so the pointer to b3() is the first and will be used. Then you might have the following scenario. Let's start with state when patches P1 and P2 are registered and enabled but the module M is not loaded. Then ftrace ops for b() does not exist. Then we get into the following race: CPU0 CPU1 load_module(M) complete_formation() mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); klp_register_patch(P3); klp_enable_patch(P3); # STATE 1 klp_module_notify(M) klp_module_notify_coming(P1); klp_module_notify_coming(P2); klp_module_notify_coming(P3); # STATE 2 The ftrace ops for a() and b() then looks: STATE1: ops_a->func_stack -> list(a3,a2,a1); ops_b->func_stack -> list(b3); STATE2: ops_a->func_stack -> list(a3,a2,a1); ops_b->func_stack -> list(b2,b1,b3); therefore, b2() is used for the module but a3() is used for vmcore because they were the last added. Example of the race with going modules: ======================================= CPU0 CPU1 delete_module() #SYSCALL try_stop_module() mod->state = MODULE_STATE_GOING; mutex_unlock(&module_mutex); klp_register_patch() klp_enable_patch() #save place to switch universe b() # from module that is going a() # from core (patched) mod->exit(); Note that the function b() can be called until we call mod->exit(). If we do not apply patch against b() because it is in MODULE_STATE_GOING, it will call patched a() with modified semantic and things might get wrong. 
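Condensed to its core, the change below amounts to two pieces of bookkeeping (a sketch extracted from the hunks that follow):

	/* module notifier: remember whether the module is usable for patching */
	if (action == MODULE_STATE_COMING)
		mod->klp_alive = true;
	else	/* MODULE_STATE_GOING */
		mod->klp_alive = false;

	/* new patch registration: only bind to modules the notifier has seen */
	mod = find_module(obj->name);
	if (mod && mod->klp_alive)
		obj->mod = mod;
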
[jpoimboe@redhat.com: use one boolean instead of two] Signed-off-by: Petr Mladek Acked-by: Josh Poimboeuf Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- include/linux/module.h | 4 ++++ kernel/livepatch/core.c | 30 ++++++++++++++++++++++++++---- 2 files changed, 30 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index b653d7c0a05a..7232fde6a991 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -344,6 +344,10 @@ struct module { unsigned long *ftrace_callsites; #endif +#ifdef CONFIG_LIVEPATCH + bool klp_alive; +#endif + #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ struct list_head source_list; diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 01ca08804f51..3f9f1d6b4c2e 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -89,16 +89,28 @@ static bool klp_is_object_loaded(struct klp_object *obj) /* sets obj->mod if object is not vmlinux and module is found */ static void klp_find_object_module(struct klp_object *obj) { + struct module *mod; + if (!klp_is_module(obj)) return; mutex_lock(&module_mutex); /* - * We don't need to take a reference on the module here because we have - * the klp_mutex, which is also taken by the module notifier. This - * prevents any module from unloading until we release the klp_mutex. + * We do not want to block removal of patched modules and therefore + * we do not take a reference here. The patches are removed by + * a going module handler instead. + */ + mod = find_module(obj->name); + /* + * Do not mess work of the module coming and going notifiers. + * Note that the patch might still be needed before the going handler + * is called. Module functions can be called even in the GOING state + * until mod->exit() finishes. This is especially important for + * patches that modify semantic of the functions. */ - obj->mod = find_module(obj->name); + if (mod && mod->klp_alive) + obj->mod = mod; + mutex_unlock(&module_mutex); } @@ -767,6 +779,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj) return -EINVAL; obj->state = KLP_DISABLED; + obj->mod = NULL; klp_find_object_module(obj); @@ -961,6 +974,15 @@ static int klp_module_notify(struct notifier_block *nb, unsigned long action, mutex_lock(&klp_mutex); + /* + * Each module has to know that the notifier has been called. + * We never know what module will get patched by a new patch. + */ + if (action == MODULE_STATE_COMING) + mod->klp_alive = true; + else /* MODULE_STATE_GOING */ + mod->klp_alive = false; + list_for_each_entry(patch, &klp_patches, list) { for (obj = patch->objs; obj->funcs; obj++) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) -- cgit v1.3-8-gc7d7 From 1efff914afac8a965ad63817ecf8861a927c2ace Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 17 Mar 2015 12:23:32 -0400 Subject: fs: add dirtytime_expire_seconds sysctl Add a tuning knob so we can adjust the dirtytime expiration timeout, which is very useful for testing lazytime. 
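Since the entry is added to vm_table, the knob shows up as /proc/sys/vm/dirtytime_expire_seconds; a trivial userspace sketch that shortens the expiry to 60 seconds for testing, assuming the usual procfs mount:

	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/vm/dirtytime_expire_seconds", "w");

		if (!f)
			return 1;
		fprintf(f, "60\n");
		return fclose(f) ? 1 : 0;
	}
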
Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/fs-writeback.c | 11 +++++++++++ include/linux/writeback.h | 3 +++ kernel/sysctl.c | 8 ++++++++ 3 files changed, 22 insertions(+) (limited to 'kernel') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2cfcd74faf87..32a8bbd7a9ad 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1188,6 +1188,17 @@ static int __init start_dirtytime_writeback(void) } __initcall(start_dirtytime_writeback); +int dirtytime_interval_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret == 0 && write) + mod_delayed_work(system_wq, &dirtytime_work, 0); + return ret; +} + static noinline void block_dump___mark_inode_dirty(struct inode *inode) { if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 00048339c23e..b2dd371ec0ca 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -130,6 +130,7 @@ extern int vm_dirty_ratio; extern unsigned long vm_dirty_bytes; extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; +extern unsigned int dirtytime_expire_interval; extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; @@ -146,6 +147,8 @@ extern int dirty_ratio_handler(struct ctl_table *table, int write, extern int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +int dirtytime_interval_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); struct ctl_table; int dirty_writeback_centisecs_handler(struct ctl_table *, int, diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..ce410bb9f2e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1227,6 +1227,14 @@ static struct ctl_table vm_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero, }, + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirty_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = &zero, + }, { .procname = "nr_pdflush_threads", .mode = 0444 /* read-only */, -- cgit v1.3-8-gc7d7 From 431d452af13720463dda498999b2e9a08729c03a Mon Sep 17 00:00:00 2001 From: Zhonghui Fu Date: Wed, 18 Mar 2015 15:54:27 +0100 Subject: PM / sleep: add pm-trace support for suspending phase Occasionally, the system can't come back up after suspend/resume due to problems of device suspending phase. This patch make PM_TRACE infrastructure cover device suspending phase of suspend/resume process, and the information in RTC can tell developers which device suspending function make system hang. Signed-off-by: Zhonghui Fu Signed-off-by: Rafael J. 
Wysocki --- arch/x86/include/asm/pm-trace.h | 23 +++++++++++++++++++++++ arch/x86/include/asm/resume-trace.h | 21 --------------------- drivers/base/power/main.c | 20 ++++++++++++++++---- drivers/base/power/trace.c | 6 +++--- include/linux/pm-trace.h | 35 +++++++++++++++++++++++++++++++++++ include/linux/resume-trace.h | 34 ---------------------------------- kernel/power/main.c | 2 +- 7 files changed, 78 insertions(+), 63 deletions(-) create mode 100644 arch/x86/include/asm/pm-trace.h delete mode 100644 arch/x86/include/asm/resume-trace.h create mode 100644 include/linux/pm-trace.h delete mode 100644 include/linux/resume-trace.h (limited to 'kernel') diff --git a/arch/x86/include/asm/pm-trace.h b/arch/x86/include/asm/pm-trace.h new file mode 100644 index 000000000000..7b7ac42c3661 --- /dev/null +++ b/arch/x86/include/asm/pm-trace.h @@ -0,0 +1,23 @@ +#ifndef _ASM_X86_PM_TRACE_H +#define _ASM_X86_PM_TRACE_H + +#include + +#define TRACE_RESUME(user) \ +do { \ + if (pm_trace_enabled) { \ + const void *tracedata; \ + asm volatile(_ASM_MOV " $1f,%0\n" \ + ".section .tracedata,\"a\"\n" \ + "1:\t.word %c1\n\t" \ + _ASM_PTR " %c2\n" \ + ".previous" \ + :"=r" (tracedata) \ + : "i" (__LINE__), "i" (__FILE__)); \ + generate_pm_trace(tracedata, user); \ + } \ +} while (0) + +#define TRACE_SUSPEND(user) TRACE_RESUME(user) + +#endif /* _ASM_X86_PM_TRACE_H */ diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h deleted file mode 100644 index 3ff1c2cb1da5..000000000000 --- a/arch/x86/include/asm/resume-trace.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _ASM_X86_RESUME_TRACE_H -#define _ASM_X86_RESUME_TRACE_H - -#include - -#define TRACE_RESUME(user) \ -do { \ - if (pm_trace_enabled) { \ - const void *tracedata; \ - asm volatile(_ASM_MOV " $1f,%0\n" \ - ".section .tracedata,\"a\"\n" \ - "1:\t.word %c1\n\t" \ - _ASM_PTR " %c2\n" \ - ".previous" \ - :"=r" (tracedata) \ - : "i" (__LINE__), "i" (__FILE__)); \ - generate_resume_trace(tracedata, user); \ - } \ -} while (0) - -#endif /* _ASM_X86_RESUME_TRACE_H */ diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index 9717d5f20139..3d874eca7104 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -1017,6 +1017,9 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a char *info = NULL; int error = 0; + TRACE_DEVICE(dev); + TRACE_SUSPEND(0); + if (async_error) goto Complete; @@ -1057,6 +1060,7 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a Complete: complete_all(&dev->power.completion); + TRACE_SUSPEND(error); return error; } @@ -1078,7 +1082,7 @@ static int device_suspend_noirq(struct device *dev) { reinit_completion(&dev->power.completion); - if (pm_async_enabled && dev->power.async_suspend) { + if (is_async(dev)) { get_device(dev); async_schedule(async_suspend_noirq, dev); return 0; @@ -1157,6 +1161,9 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as char *info = NULL; int error = 0; + TRACE_DEVICE(dev); + TRACE_SUSPEND(0); + __pm_runtime_disable(dev, false); if (async_error) @@ -1198,6 +1205,7 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as async_error = error; Complete: + TRACE_SUSPEND(error); complete_all(&dev->power.completion); return error; } @@ -1219,7 +1227,7 @@ static int device_suspend_late(struct device *dev) { reinit_completion(&dev->power.completion); - if 
(pm_async_enabled && dev->power.async_suspend) { + if (is_async(dev)) { get_device(dev); async_schedule(async_suspend_late, dev); return 0; @@ -1338,6 +1346,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) int error = 0; DECLARE_DPM_WATCHDOG_ON_STACK(wd); + TRACE_DEVICE(dev); + TRACE_SUSPEND(0); + dpm_wait_for_children(dev, async); if (async_error) @@ -1444,6 +1455,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async) if (error) async_error = error; + TRACE_SUSPEND(error); return error; } @@ -1465,7 +1477,7 @@ static int device_suspend(struct device *dev) { reinit_completion(&dev->power.completion); - if (pm_async_enabled && dev->power.async_suspend) { + if (is_async(dev)) { get_device(dev); async_schedule(async_suspend, dev); return 0; diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index d94a1f5121cf..a311cfa4c5bd 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -7,7 +7,7 @@ * devices may be working. */ -#include +#include #include #include @@ -154,7 +154,7 @@ EXPORT_SYMBOL(set_trace_device); * it's not any guarantee, but it's a high _likelihood_ that * the match is valid). */ -void generate_resume_trace(const void *tracedata, unsigned int user) +void generate_pm_trace(const void *tracedata, unsigned int user) { unsigned short lineno = *(unsigned short *)tracedata; const char *file = *(const char **)(tracedata + 2); @@ -164,7 +164,7 @@ void generate_resume_trace(const void *tracedata, unsigned int user) file_hash_value = hash_string(lineno, file, FILEHASH); set_magic_time(user_hash_value, file_hash_value, dev_hash_value); } -EXPORT_SYMBOL(generate_resume_trace); +EXPORT_SYMBOL(generate_pm_trace); extern char __tracedata_start, __tracedata_end; static int show_file_hash(unsigned int value) diff --git a/include/linux/pm-trace.h b/include/linux/pm-trace.h new file mode 100644 index 000000000000..ecbde7a5548e --- /dev/null +++ b/include/linux/pm-trace.h @@ -0,0 +1,35 @@ +#ifndef PM_TRACE_H +#define PM_TRACE_H + +#ifdef CONFIG_PM_TRACE +#include +#include + +extern int pm_trace_enabled; + +static inline int pm_trace_is_enabled(void) +{ + return pm_trace_enabled; +} + +struct device; +extern void set_trace_device(struct device *); +extern void generate_pm_trace(const void *tracedata, unsigned int user); +extern int show_trace_dev_match(char *buf, size_t size); + +#define TRACE_DEVICE(dev) do { \ + if (pm_trace_enabled) \ + set_trace_device(dev); \ + } while(0) + +#else + +static inline int pm_trace_is_enabled(void) { return 0; } + +#define TRACE_DEVICE(dev) do { } while (0) +#define TRACE_RESUME(dev) do { } while (0) +#define TRACE_SUSPEND(dev) do { } while (0) + +#endif + +#endif diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h deleted file mode 100644 index f31db2368782..000000000000 --- a/include/linux/resume-trace.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef RESUME_TRACE_H -#define RESUME_TRACE_H - -#ifdef CONFIG_PM_TRACE -#include -#include - -extern int pm_trace_enabled; - -static inline int pm_trace_is_enabled(void) -{ - return pm_trace_enabled; -} - -struct device; -extern void set_trace_device(struct device *); -extern void generate_resume_trace(const void *tracedata, unsigned int user); -extern int show_trace_dev_match(char *buf, size_t size); - -#define TRACE_DEVICE(dev) do { \ - if (pm_trace_enabled) \ - set_trace_device(dev); \ - } while(0) - -#else - -static inline int pm_trace_is_enabled(void) { return 0; } - -#define TRACE_DEVICE(dev) do { } 
while (0) -#define TRACE_RESUME(dev) do { } while (0) - -#endif - -#endif diff --git a/kernel/power/main.c b/kernel/power/main.c index 9a59d042ea84..86e8157a450f 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.3-8-gc7d7 From 3fa0818b3c85e9bb55e3ac96c9523b87e44eab9e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Mar 2015 12:12:07 -0400 Subject: sched, isolcpu: make cpu_isolated_map visible outside scheduler Needed by the next patch. Also makes cpu_isolated_map present when compiled without SMP and/or with CONFIG_NR_CPUS=1, like the other cpu masks. At some point we may want to clean things up so cpumasks do not exist in UP kernels. Maybe something for the CONFIG_TINY crowd. Cc: Peter Zijlstra Cc: Clark Williams Cc: Li Zefan Cc: Ingo Molnar Cc: Luiz Capitulino Cc: David Rientjes Cc: Mike Galbraith Cc: cgroups@vger.kernel.org Signed-off-by: Rik van Riel Acked-by: Zefan Li Signed-off-by: Tejun Heo --- include/linux/sched.h | 2 ++ kernel/sched/core.c | 6 +++--- 2 files changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..ca365d79480c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -329,6 +329,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern cpumask_var_t cpu_isolated_map; + extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..b578bb23410b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -306,6 +306,9 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; +/* cpus with isolated domains */ +cpumask_var_t cpu_isolated_map; + /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -5811,9 +5814,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) update_top_cache_domain(cpu); } -/* cpus with isolated domains */ -static cpumask_var_t cpu_isolated_map; - /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { -- cgit v1.3-8-gc7d7 From 47b8ea7186aae7f474ec4c98f43eaa8da719cd83 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 9 Mar 2015 12:12:08 -0400 Subject: cpusets, isolcpus: exclude isolcpus from load balancing in cpusets Ensure that cpus specified with the isolcpus= boot commandline option stay outside of the load balancing in the kernel scheduler. Operations like load balancing can introduce unwanted latencies, which is exactly what the isolcpus= commandline is there to prevent. Previously, simply creating a new cpuset, without even touching the cpuset.cpus field inside the new cpuset, would undo the effects of isolcpus=, by creating a scheduler domain spanning the whole system, and setting up load balancing inside that domain. The cpuset root cpuset.cpus file is read-only, so there was not even a way to undo that effect. This does not impact the majority of cpusets users, since isolcpus= is a fairly specialized feature used for realtime purposes. 
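With cpu_isolated_map now declared in <linux/sched.h> (previous patch), code outside the scheduler can honour isolcpus= as well; a minimal sketch with a hypothetical helper name:

	#include <linux/cpumask.h>
	#include <linux/sched.h>

	static bool example_cpu_is_load_balanced(int cpu)
	{
		/* CPUs listed in isolcpus= never take part in load balancing */
		return !cpumask_test_cpu(cpu, cpu_isolated_map);
	}
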
Cc: Peter Zijlstra Cc: Clark Williams Cc: Li Zefan Cc: Ingo Molnar Cc: Luiz Capitulino Cc: Mike Galbraith Cc: cgroups@vger.kernel.org Signed-off-by: Rik van Riel Tested-by: David Rientjes Acked-by: Peter Zijlstra (Intel) Acked-by: David Rientjes Acked-by: Zefan Li Signed-off-by: Tejun Heo --- kernel/cpuset.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index fc7f4748d34a..c68f0721df10 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -622,6 +622,7 @@ static int generate_sched_domains(cpumask_var_t **domains, int csn; /* how many cpuset ptrs in csa so far */ int i, j, k; /* indices for partition finding loops */ cpumask_var_t *doms; /* resulting partition; i.e. sched domains */ + cpumask_var_t non_isolated_cpus; /* load balanced CPUs */ struct sched_domain_attr *dattr; /* attributes for custom domains */ int ndoms = 0; /* number of sched domains in result */ int nslot; /* next empty doms[] struct cpumask slot */ @@ -631,6 +632,10 @@ static int generate_sched_domains(cpumask_var_t **domains, dattr = NULL; csa = NULL; + if (!alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL)) + goto done; + cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); + /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { ndoms = 1; @@ -643,7 +648,8 @@ static int generate_sched_domains(cpumask_var_t **domains, *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } - cpumask_copy(doms[0], top_cpuset.effective_cpus); + cpumask_and(doms[0], top_cpuset.effective_cpus, + non_isolated_cpus); goto done; } @@ -666,7 +672,8 @@ static int generate_sched_domains(cpumask_var_t **domains, * the corresponding sched domain. */ if (!cpumask_empty(cp->cpus_allowed) && - !is_sched_load_balance(cp)) + !(is_sched_load_balance(cp) && + cpumask_intersects(cp->cpus_allowed, non_isolated_cpus))) continue; if (is_sched_load_balance(cp)) @@ -748,6 +755,7 @@ restart: if (apn == b->pn) { cpumask_or(dp, dp, b->effective_cpus); + cpumask_and(dp, dp, non_isolated_cpus); if (dattr) update_domain_attr_tree(dattr + nslot, b); @@ -760,6 +768,7 @@ restart: BUG_ON(nslot != ndoms); done: + free_cpumask_var(non_isolated_cpus); kfree(csa); /* -- cgit v1.3-8-gc7d7 From a77da14ce9afb338040b405f6ab8afddc310411d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 8 Mar 2015 14:52:27 -0700 Subject: rcu: Yet another fix for preemption and CPU hotplug As noted earlier, the following sequence of events can occur when running PREEMPT_RCU and HOTPLUG_CPU on a system with a multi-level rcu_node combining tree: 1. A group of tasks block on CPUs corresponding to a given leaf rcu_node structure while within RCU read-side critical sections. 2. All CPUs corrsponding to that rcu_node structure go offline. 3. The next grace period starts, but because there are still tasks blocked, the upper-level bits corresponding to this leaf rcu_node structure remain set. 4. All the tasks exit their RCU read-side critical sections and remove themselves from the leaf rcu_node structure's list, leaving it empty. 5. But because there now is code to check for this condition at force-quiescent-state time, the upper bits are cleared and the grace period completes. However, there is another complication that can occur following step 4 above: 4a. The grace period starts, and the leaf rcu_node structure's gp_tasks pointer is set to NULL because there are no tasks blocked on this structure. 4b. 
One of the CPUs corresponding to the leaf rcu_node structure comes back online. 4b. An endless stream of tasks are preempted within RCU read-side critical sections on this CPU, such that the ->blkd_tasks list is always non-empty. The grace period will never end. This commit therefore makes the force-quiescent-state processing check only for absence of tasks blocking the current grace period rather than absence of tasks altogether. This will cause a quiescent state to be reported if the current leaf rcu_node structure is not blocking the current grace period and its parent thinks that it is, regardless of how RCU managed to get itself into this state. Signed-off-by: Paul E. McKenney Cc: # 4.0.x Tested-by: Sasha Levin --- kernel/rcu/tree.c | 43 +++++++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 17b5abf999ca..b3684b284677 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2199,8 +2199,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, unsigned long mask; struct rcu_node *rnp_p; - WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state); - if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || + rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -2208,9 +2208,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, rnp_p = rnp->parent; if (rnp_p == NULL) { /* - * Either there is only one rcu_node in the tree, - * or tasks were kicked up to root rcu_node due to - * CPUs going offline. + * Only one rcu_node structure in the tree, so don't + * try to report up to its nonexistent parent! */ rcu_report_qs_rsp(rsp, flags); return; @@ -2713,8 +2712,29 @@ static void force_qs_rnp(struct rcu_state *rsp, return; } if (rnp->qsmask == 0) { - rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ - continue; + if (rcu_state_p == &rcu_sched_state || + rsp != rcu_state_p || + rcu_preempt_blocked_readers_cgp(rnp)) { + /* + * No point in scanning bits because they + * are all zero. But we might need to + * priority-boost blocked readers. + */ + rcu_initiate_boost(rnp, flags); + /* rcu_initiate_boost() releases rnp->lock */ + continue; + } + if (rnp->parent && + (rnp->parent->qsmask & rnp->grpmask)) { + /* + * Race between grace-period + * initialization and task exiting RCU + * read-side critical section: Report. + */ + rcu_report_unblock_qs_rnp(rsp, rnp, flags); + /* rcu_report_unblock_qs_rnp() rlses ->lock */ + continue; + } } cpu = rnp->grplo; bit = 1; @@ -2729,15 +2749,6 @@ static void force_qs_rnp(struct rcu_state *rsp, if (mask != 0) { /* Idle/offline CPUs, report. */ rcu_report_qs_rnp(mask, rsp, rnp, flags); - } else if (rnp->parent && - list_empty(&rnp->blkd_tasks) && - !rnp->qsmask && - (rnp->parent->qsmask & rnp->grpmask)) { - /* - * Race between grace-period initialization and task - * existing RCU read-side critical section, report. - */ - rcu_report_unblock_qs_rnp(rsp, rnp, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); -- cgit v1.3-8-gc7d7 From 654e953340491e498871321d7e2c9b0a12821933 Mon Sep 17 00:00:00 2001 From: "Paul E. 
McKenney" Date: Sun, 15 Mar 2015 09:19:35 -0700 Subject: rcu: Associate quiescent-state reports with grace period As noted in earlier commit logs, CPU hotplug operations running concurrently with grace-period initialization can result in a given leaf rcu_node structure having all CPUs offline and no blocked readers, but with this rcu_node structure nevertheless blocking the current grace period. Therefore, the quiescent-state forcing code now checks for this situation and repairs it. Unfortunately, this checking can result in false positives, for example, when the last task has just removed itself from this leaf rcu_node structure, but has not yet started clearing the ->qsmask bits further up the structure. This means that the grace-period kthread (which forces quiescent states) and some other task might be attempting to concurrently clear these ->qsmask bits. This is usually not a problem: One of these tasks will be the first to acquire the upper-level rcu_node structure's lock and with therefore clear the bit, and the other task, seeing the bit already cleared, will stop trying to clear bits. Sadly, this means that the following unusual sequence of events -can- result in a problem: 1. The grace-period kthread wins, and clears the ->qsmask bits. 2. This is the last thing blocking the current grace period, so that the grace-period kthread clears ->qsmask bits all the way to the root and finds that the root ->qsmask field is now zero. 3. Another grace period is required, so that the grace period kthread initializes it, including setting all the needed qsmask bits. 4. The leaf rcu_node structure (the one that started this whole mess) is blocking this new grace period, either because it has at least one online CPU or because there is at least one task that had blocked within an RCU read-side critical section while running on one of this leaf rcu_node structure's CPUs. (And yes, that CPU might well have gone offline before the grace period in step (3) above started, which can mean that there is a task on the leaf rcu_node structure's ->blkd_tasks list, but ->qsmask equal to zero.) 5. The other kthread didn't get around to trying to clear the upper level ->qsmask bits until all the above had happened. This means that it now sees bits set in the upper-level ->qsmask field, so it proceeds to clear them. Too bad that it is doing so on behalf of a quiescent state that does not apply to the current grace period! This sequence of events can result in the new grace period being too short. It can also result in the new grace period ending before the leaf rcu_node structure's ->qsmask bits have been cleared, which will result in splats during initialization of the next grace period. In addition, it can result in tasks blocking the new grace period still being queued at the start of the next grace period, which will result in other splats. Sasha's testing turned up another of these splats, as did rcutorture testing. (And yes, rcutorture is being adjusted to make these splats show up more quickly. Which probably is having the undesirable side effect of making other problems show up less quickly. Can't have everything!) Reported-by: Sasha Levin Signed-off-by: Paul E. 
McKenney Cc: # 4.0.x Tested-by: Sasha Levin --- kernel/rcu/tree.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b3684b284677..8fcc64ed858c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2132,25 +2132,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) * Similar to rcu_report_qs_rdp(), for which it is a helper function. * Allows quiescent states for a group of CPUs to be reported at one go * to the specified rcu_node structure, though all the CPUs in the group - * must be represented by the same rcu_node structure (which need not be - * a leaf rcu_node structure, though it often will be). That structure's - * lock must be held upon entry, and it is released before return. + * must be represented by the same rcu_node structure (which need not be a + * leaf rcu_node structure, though it often will be). The gps parameter + * is the grace-period snapshot, which means that the quiescent states + * are valid only if rnp->gpnum is equal to gps. That structure's lock + * must be held upon entry, and it is released before return. */ static void rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, - struct rcu_node *rnp, unsigned long flags) + struct rcu_node *rnp, unsigned long gps, unsigned long flags) __releases(rnp->lock) { + unsigned long oldmask = 0; struct rcu_node *rnp_c; /* Walk up the rcu_node hierarchy. */ for (;;) { - if (!(rnp->qsmask & mask)) { + if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { - /* Our bit has already been cleared, so done. */ + /* + * Our bit has already been cleared, or the + * relevant grace period is already over, so done. + */ raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } + WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ rnp->qsmask &= ~mask; trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, mask, rnp->qsmask, rnp->level, @@ -2174,7 +2181,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, rnp = rnp->parent; raw_spin_lock_irqsave(&rnp->lock, flags); smp_mb__after_unlock_lock(); - WARN_ON_ONCE(rnp_c->qsmask); + oldmask = rnp_c->qsmask; } /* @@ -2196,6 +2203,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { + unsigned long gps; unsigned long mask; struct rcu_node *rnp_p; @@ -2215,12 +2223,13 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, return; } - /* Report up the rest of the hierarchy. */ + /* Report up the rest of the hierarchy, tracking current ->gpnum. */ + gps = rnp->gpnum; mask = rnp->grpmask; raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ smp_mb__after_unlock_lock(); - rcu_report_qs_rnp(mask, rsp, rnp_p, flags); + rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } /* @@ -2271,7 +2280,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); + /* ^^^ Released rnp->lock */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -2747,8 +2757,8 @@ static void force_qs_rnp(struct rcu_state *rsp, } } if (mask != 0) { - /* Idle/offline CPUs, report. */ - rcu_report_qs_rnp(mask, rsp, rnp, flags); + /* Idle/offline CPUs, report (releases rnp->lock. 
*/ + rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ raw_spin_unlock_irqrestore(&rnp->lock, flags); -- cgit v1.3-8-gc7d7 From 94caee8c312d96522bcdae88791aaa9ebcd5f22c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 20 Mar 2015 15:11:11 +0100 Subject: ebpf: add sched_act_type and map it to sk_filter's verifier ops In order to prepare eBPF support for tc action, we need to add sched_act_type, so that the eBPF verifier is aware of what helper function act_bpf may use, that it can load skb data and read out currently available skb fields. This is bascially analogous to 96be4325f443 ("ebpf: add sched_cls_type and map it to sk_filter's verifier ops"). BPF_PROG_TYPE_SCHED_CLS and BPF_PROG_TYPE_SCHED_ACT need to be separate since both will have a different set of functionality in future (classifier vs action), thus we won't run into ABI troubles when the point in time comes to diverge functionality from the classifier. The future plan for act_bpf would be that it will be able to write into skb->data and alter selected fields mirrored in struct __sk_buff. For an initial support, it's sufficient to map it to sk_filter_ops. Signed-off-by: Daniel Borkmann Cc: Jiri Pirko Reviewed-by: Jiri Pirko Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller --- include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 1 + net/core/filter.c | 6 ++++++ 3 files changed, 8 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 1623047af463..3dd314a45d0d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -119,6 +119,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, }; #define BPF_PSEUDO_MAP_FD 1 diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c22ebd36fa4b..0e714f799ec0 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1180,6 +1180,7 @@ static bool may_access_skb(enum bpf_prog_type type) switch (type) { case BPF_PROG_TYPE_SOCKET_FILTER: case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: return true; default: return false; diff --git a/net/core/filter.c b/net/core/filter.c index bdaac5895def..084eacc4d1d4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1263,10 +1263,16 @@ static struct bpf_prog_type_list sched_cls_type __read_mostly = { .type = BPF_PROG_TYPE_SCHED_CLS, }; +static struct bpf_prog_type_list sched_act_type __read_mostly = { + .ops = &sk_filter_ops, + .type = BPF_PROG_TYPE_SCHED_ACT, +}; + static int __init register_sk_filter_ops(void) { bpf_register_prog_type(&sk_filter_type); bpf_register_prog_type(&sched_cls_type); + bpf_register_prog_type(&sched_act_type); return 0; } -- cgit v1.3-8-gc7d7 From d525211f9d1be8b523ec7633f080f2116f5ea536 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Feb 2015 18:03:11 +0100 Subject: perf: Fix irq_work 'tail' recursion Vince reported a watchdog lockup like: [] perf_tp_event+0xc4/0x210 [] perf_trace_lock+0x12a/0x160 [] lock_release+0x130/0x260 [] _raw_spin_unlock_irqrestore+0x24/0x40 [] do_send_sig_info+0x5d/0x80 [] send_sigio_to_task+0x12f/0x1a0 [] send_sigio+0xae/0x100 [] kill_fasync+0x97/0xf0 [] perf_event_wakeup+0xd4/0xf0 [] perf_pending_event+0x33/0x60 [] irq_work_run_list+0x4c/0x80 [] irq_work_run+0x18/0x40 [] smp_trace_irq_work_interrupt+0x3f/0xc0 [] trace_irq_work_interrupt+0x6d/0x80 Which is caused by an irq_work generating new irq_work and therefore not allowing forward 
progress. This happens because processing the perf irq_work triggers another perf event (tracepoint stuff) which in turn generates an irq_work ad infinitum. Avoid this by raising the recursion counter in the irq_work -- which effectively disables all software events (including tracepoints) from actually triggering again. Reported-by: Vince Weaver Tested-by: Vince Weaver Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Paul Mackerras Cc: Steven Rostedt Cc: Link: http://lkml.kernel.org/r/20150219170311.GH21418@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index 453ef61311d4..2fabc0627165 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4574,6 +4574,13 @@ static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); + int rctx; + + rctx = perf_swevent_get_recursion_context(); + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ if (event->pending_disable) { event->pending_disable = 0; @@ -4584,6 +4591,9 @@ static void perf_pending_event(struct irq_work *entry) event->pending_wakeup = 0; perf_event_wakeup(event); } + + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } /* -- cgit v1.3-8-gc7d7 From 746db9443ea57fd9c059f62c4bfbf41cf224fe13 Mon Sep 17 00:00:00 2001 From: Brian Silverman Date: Wed, 18 Feb 2015 16:23:56 -0800 Subject: sched: Fix RLIMIT_RTTIME when PI-boosting to RT When non-realtime tasks get priority-inheritance boosted to a realtime scheduling class, RLIMIT_RTTIME starts to apply to them. However, the counter used for checking this (the same one used for SCHED_RR timeslices) was not getting reset. This meant that tasks running with a non-realtime scheduling class which are repeatedly boosted to a realtime one, but never block while they are running realtime, eventually hit the timeout without ever running for a time over the limit. This patch resets the realtime timeslice counter when un-PI-boosting from an RT to a non-RT scheduling class. I have some test code with two threads and a shared PTHREAD_PRIO_INHERIT mutex which induces priority boosting and spins while boosted that gets killed by a SIGXCPU on non-fixed kernels but doesn't with this patch applied. It happens much faster with a CONFIG_PREEMPT_RT kernel, and does happen eventually with PREEMPT_VOLUNTARY kernels. Signed-off-by: Brian Silverman Signed-off-by: Peter Zijlstra (Intel) Cc: austin@peloton-tech.com Cc: Link: http://lkml.kernel.org/r/1424305436-6716-1-git-send-email-brian@peloton-tech.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..62671f53202a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3034,6 +3034,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio) } else { if (dl_prio(oldprio)) p->dl.dl_boosted = 0; + if (rt_prio(oldprio)) + p->rt.timeout = 0; p->sched_class = &fair_sched_class; } -- cgit v1.3-8-gc7d7 From 35a9393c95b31870a74f51a3e7455f33f5657b6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 26 Feb 2015 16:23:11 +0100 Subject: lockdep: Fix the module unload key range freeing logic Module unload calls lockdep_free_key_range(), which removes entries from the data structures. 
Most of the lockdep code OTOH assumes the data structures are append only; in specific see the comments in add_lock_to_list() and look_up_lock_class(). Clearly this has only worked by accident; make it work proper. The actual scenario to make it go boom would involve the memory freed by the module unlock being re-allocated and re-used for a lock inside of a rcu-sched grace period. This is a very unlikely scenario, still better plug the hole. Use RCU list iteration in all places and ammend the comments. Change lockdep_free_key_range() to issue a sync_sched() between removal from the lists and returning -- which results in the memory being freed. Further ensure the callers are placed correctly and comment the requirements. Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Andrey Tsyvarev Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rusty Russell Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 81 ++++++++++++++++++++++++++++++++---------------- kernel/module.c | 8 ++--- 2 files changed, 59 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 88d0d4420ad2..ba77ab5f64dd 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -633,7 +633,7 @@ static int count_matching_names(struct lock_class *new_class) if (!new_class->name) return 0; - list_for_each_entry(class, &all_lock_classes, lock_entry) { + list_for_each_entry_rcu(class, &all_lock_classes, lock_entry) { if (new_class->key - new_class->subclass == class->key) return class->name_version; if (class->name && !strcmp(class->name, new_class->name)) @@ -700,10 +700,12 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) hash_head = classhashentry(key); /* - * We can walk the hash lockfree, because the hash only - * grows, and we are careful when adding entries to the end: + * We do an RCU walk of the hash, see lockdep_free_key_range(). */ - list_for_each_entry(class, hash_head, hash_entry) { + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return NULL; + + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? 
Did someone trample @@ -728,7 +730,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) struct lockdep_subclass_key *key; struct list_head *hash_head; struct lock_class *class; - unsigned long flags; + + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); class = look_up_lock_class(lock, subclass); if (likely(class)) @@ -750,28 +753,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) key = lock->key->subkeys + subclass; hash_head = classhashentry(key); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } /* * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry(class, hash_head, hash_entry) + list_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; + } + /* * Allocate a new key from the static array, and add it to * the hash: */ if (nr_lock_classes >= MAX_LOCKDEP_KEYS) { if (!debug_locks_off_graph_unlock()) { - raw_local_irq_restore(flags); return NULL; } - raw_local_irq_restore(flags); print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); @@ -798,7 +799,6 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) if (verbose(class)) { graph_unlock(); - raw_local_irq_restore(flags); printk("\nnew class %p: %s", class->key, class->name); if (class->name_version > 1) @@ -806,15 +806,12 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) printk("\n"); dump_stack(); - raw_local_irq_save(flags); if (!graph_lock()) { - raw_local_irq_restore(flags); return NULL; } } out_unlock_set: graph_unlock(); - raw_local_irq_restore(flags); out_set_class_cache: if (!subclass || force) @@ -870,11 +867,9 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, entry->distance = distance; entry->trace = *trace; /* - * Since we never remove from the dependency list, the list can - * be walked lockless by other CPUs, it's only allocation - * that must be protected by the spinlock. But this also means - * we must make new entries visible only once writes to the - * entry become visible - hence the RCU op: + * Both allocation and removal are done under the graph lock; but + * iteration is under RCU-sched; see look_up_lock_class() and + * lockdep_free_key_range(). 
*/ list_add_tail_rcu(&entry->entry, head); @@ -1025,7 +1020,9 @@ static int __bfs(struct lock_list *source_entry, else head = &lock->class->locks_before; - list_for_each_entry(entry, head, entry) { + DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + + list_for_each_entry_rcu(entry, head, entry) { if (!lock_accessed(entry)) { unsigned int cq_depth; mark_lock_accessed(entry, lock); @@ -2022,7 +2019,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry(chain, hash_head, entry) { + list_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2996,8 +2993,18 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name, if (unlikely(!debug_locks)) return; - if (subclass) + if (subclass) { + unsigned long flags; + + if (DEBUG_LOCKS_WARN_ON(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; register_lock_class(lock, subclass, 1); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); + } } EXPORT_SYMBOL_GPL(lockdep_init_map); @@ -3887,9 +3894,17 @@ static inline int within(const void *addr, void *start, unsigned long size) return addr >= start && addr < start + size; } +/* + * Used in module.c to remove lock classes from memory that is going to be + * freed; and possibly re-used by other modules. + * + * We will have had one sync_sched() before getting here, so we're guaranteed + * nobody will look up these exact classes -- they're properly dead but still + * allocated. + */ void lockdep_free_key_range(void *start, unsigned long size) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i; @@ -3905,7 +3920,7 @@ void lockdep_free_key_range(void *start, unsigned long size) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3916,11 +3931,25 @@ void lockdep_free_key_range(void *start, unsigned long size) if (locked) graph_unlock(); raw_local_irq_restore(flags); + + /* + * Wait for any possible iterators from look_up_lock_class() to pass + * before continuing to free the memory they refer to. + * + * sync_sched() is sufficient because the read-side is IRQ disable. + */ + synchronize_sched(); + + /* + * XXX at this point we could return the resources to the pool; + * instead we leak them. We would need to change to bitmap allocators + * instead of the linear allocators we have now. + */ } void lockdep_reset_lock(struct lockdep_map *lock) { - struct lock_class *class, *next; + struct lock_class *class; struct list_head *head; unsigned long flags; int i, j; @@ -3948,7 +3977,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) head = classhash_table + i; if (list_empty(head)) continue; - list_for_each_entry_safe(class, next, head, hash_entry) { + list_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) diff --git a/kernel/module.c b/kernel/module.c index b3d634ed06c9..99fdf94efce8 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1865,7 +1865,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - /* Free lock-classes: */ + /* Free lock-classes; relies on the preceding sync_rcu(). 
*/ lockdep_free_key_range(mod->module_core, mod->core_size); /* Finally, free the core (containing the module structure) */ @@ -3349,9 +3349,6 @@ static int load_module(struct load_info *info, const char __user *uargs, module_bug_cleanup(mod); mutex_unlock(&module_mutex); - /* Free lock-classes: */ - lockdep_free_key_range(mod->module_core, mod->core_size); - /* we can't deallocate the module until we clear memory protection */ unset_module_init_ro_nx(mod); unset_module_core_ro_nx(mod); @@ -3375,6 +3372,9 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_rcu(); mutex_unlock(&module_mutex); free_module: + /* Free lock-classes; relies on the preceding sync_rcu() */ + lockdep_free_key_range(mod->module_core, mod->core_size); + module_deallocate(mod, info); free_copy: free_copy(info); -- cgit v1.3-8-gc7d7 From a127d2bcf1fbc8c8e0b5cf0dab54f7d3ff50ce47 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Wed, 18 Mar 2015 16:19:27 +0530 Subject: timers/tick/broadcast-hrtimer: Fix suspicious RCU usage in idle loop The hrtimer mode of broadcast queues hrtimers in the idle entry path so as to wakeup cpus in deep idle states. The associated call graph is : cpuidle_idle_call() |____ clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, ....)) |_____tick_broadcast_set_event() |____clockevents_program_event() |____bc_set_next() The hrtimer_{start/cancel} functions call into tracing which uses RCU. But it is not legal to call into RCU in cpuidle because it is one of the quiescent states. Hence protect this region with RCU_NONIDLE which informs RCU that the cpu is momentarily non-idle. As an aside it is helpful to point out that the clock event device that is programmed here is not a per-cpu clock device; it is a pseudo clock device, used by the broadcast framework alone. The per-cpu clock device programming never goes through bc_set_next(). Signed-off-by: Preeti U Murthy Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Paul E. McKenney Cc: linuxppc-dev@ozlabs.org Cc: mpe@ellerman.id.au Cc: tglx@linutronix.de Link: http://lkml.kernel.org/r/20150318104705.17763.56668.stgit@preeti.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast-hrtimer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index eb682d5c697c..6aac4beedbbe 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -49,6 +49,7 @@ static void bc_set_mode(enum clock_event_mode mode, */ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) { + int bc_moved; /* * We try to cancel the timer first. If the callback is on * flight on some other cpu then we let it handle it. If we @@ -60,9 +61,15 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc) * restart the timer because we are in the callback, but we * can set the expiry time and let the callback return * HRTIMER_RESTART. + * + * Since we are in the idle loop at this point and because + * hrtimer_{start/cancel} functions call into tracing, + * calls to these functions must be bound within RCU_NONIDLE. */ - if (hrtimer_try_to_cancel(&bctimer) >= 0) { - hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED); + RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ? 
+ !hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) : + 0); + if (bc_moved) { /* Bind the "device" to the cpu */ bc->bound_on = smp_processor_id(); } else if (bc->bound_on == smp_processor_id()) { -- cgit v1.3-8-gc7d7 From b6366f048e0caff28af5335b7af2031266e1b06b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 18 Mar 2015 14:49:46 -0400 Subject: sched/rt: Use IPI to trigger RT task push migration instead of pulling When debugging the latencies on a 40 core box, where we hit 300 to 500 microsecond latencies, I found there was a huge contention on the runqueue locks. Investigating it further, running ftrace, I found that it was due to the pulling of RT tasks. The test that was run was the following: cyclictest --numa -p95 -m -d0 -i100 This created a thread on each CPU, that would set its wakeup in iterations of 100 microseconds. The -d0 means that all the threads had the same interval (100us). Each thread sleeps for 100us and wakes up and measures its latencies. cyclictest is maintained at: git://git.kernel.org/pub/scm/linux/kernel/git/clrkwllms/rt-tests.git What happened was another RT task would be scheduled on one of the CPUs that was running our test, when the other CPU tests went to sleep and scheduled idle. This caused the "pull" operation to execute on all these CPUs. Each one of these saw the RT task that was overloaded on the CPU of the test that was still running, and each one tried to grab that task in a thundering herd way. To grab the task, each thread would do a double rq lock grab, grabbing its own lock as well as the rq of the overloaded CPU. As the sched domains on this box were rather flat for its size, I saw up to 12 CPUs block on this lock at once. This caused a ripple effect with the rq locks especially since the taking was done via a double rq lock, which means that several of the CPUs had their own rq locks held while trying to take this rq lock. As these locks were blocked, any wakeups or load balancing on these CPUs would also block on these locks, and the wait time escalated. I've tried various methods to lessen the load, but things like an atomic counter to only let one CPU grab the task won't work, because the task may have a limited affinity, and we may pick the wrong CPU to take that lock and do the pull, to only find out that the CPU we picked isn't in the task's affinity. Instead of doing the PULL, I now have the CPUs that want the pull to send over an IPI to the overloaded CPU, and let that CPU pick what CPU to push the task to. No more need to grab the rq lock, and the push/pull algorithm still works fine. With this patch, the latency dropped to just 150us over a 20 hour run. Without the patch, the huge latencies would trigger in seconds. I've created a new sched feature called RT_PUSH_IPI, which is enabled by default. When RT_PUSH_IPI is not enabled, the old method of grabbing the rq locks and having the pulling CPU do the work is implemented. When RT_PUSH_IPI is enabled, the IPI is sent to the overloaded CPU to do a push. To enable or disable this at run time: # mount -t debugfs nodev /sys/kernel/debug # echo RT_PUSH_IPI > /sys/kernel/debug/sched_features or # echo NO_RT_PUSH_IPI > /sys/kernel/debug/sched_features Update: This original patch would send an IPI to all CPUs in the RT overload list. But that could theoretically cause the reverse issue. That is, there could be lots of overloaded RT queues and one CPU lowers its priority.
It would then send an IPI to all the overloaded RT queues and they could then all try to grab the rq lock of the CPU lowering its priority, and then we have the same problem. The latest design sends out only one IPI to the first overloaded CPU. It tries to push any tasks that it can, and then looks for the next overloaded CPU that can push to the source CPU. The IPIs stop when all overloaded CPUs that have pushable tasks that have priorities greater than the source CPU are covered. In case the source CPU lowers its priority again, a flag is set to tell the IPI traversal to restart with the first RT overloaded CPU after the source CPU. Parts-suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra (Intel) Cc: Joern Engel Cc: Clark Williams Cc: Mike Galbraith Cc: Paul E. McKenney Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150318144946.2f3cc982@gandalf.local.home Signed-off-by: Ingo Molnar --- kernel/sched/features.h | 13 ++++ kernel/sched/rt.c | 177 ++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 12 ++++ 3 files changed, 202 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 90284d117fe6..91e33cd485f6 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,19 @@ SCHED_FEAT(NONTASK_CAPACITY, true) */ SCHED_FEAT(TTWU_QUEUE, true) +#ifdef HAVE_RT_PUSH_IPI +/* + * In order to avoid a thundering herd attack of CPUs that are + * lowering their priorities at the same time, and there being + * a single CPU that has an RT task that can migrate and is waiting + * to run, where the other CPUs will try to take that CPUs + * rq lock and possibly create a large contention, sending an + * IPI to that CPU and let that CPU push the RT task to where + * it should go may be a better scenario. + */ +SCHED_FEAT(RT_PUSH_IPI, true) +#endif + SCHED_FEAT(FORCE_SD_OVERLAP, false) SCHED_FEAT(RT_RUNTIME_SHARE, true) SCHED_FEAT(LB_MIN, false) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f4d4b077eba0..ad0241561c3e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include +#include int sched_rr_timeslice = RR_TIMESLICE; @@ -59,6 +60,10 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_unlock(&rt_b->rt_runtime_lock); } +#ifdef CONFIG_SMP +static void push_irq_work_func(struct irq_work *work); +#endif + void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) { struct rt_prio_array *array; @@ -78,7 +83,14 @@ void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); + +#ifdef HAVE_RT_PUSH_IPI + rt_rq->push_flags = 0; + rt_rq->push_cpu = nr_cpu_ids; + raw_spin_lock_init(&rt_rq->push_lock); + init_irq_work(&rt_rq->push_work, push_irq_work_func); #endif +#endif /* CONFIG_SMP */ /* We start is dequeued state, because no RT tasks are queued */ rt_rq->rt_queued = 0; @@ -1778,6 +1790,164 @@ static void push_rt_tasks(struct rq *rq) ; } +#ifdef HAVE_RT_PUSH_IPI +/* + * The search for the next cpu always starts at rq->cpu and ends + * when we reach rq->cpu again. It will never return rq->cpu. + * This returns the next cpu to check, or nr_cpu_ids if the loop + * is complete. + * + * rq->rt.push_cpu holds the last cpu returned by this function, + * or if this is the first instance, it must hold rq->cpu. 
+ */ +static int rto_next_cpu(struct rq *rq) +{ + int prev_cpu = rq->rt.push_cpu; + int cpu; + + cpu = cpumask_next(prev_cpu, rq->rd->rto_mask); + + /* + * If the previous cpu is less than the rq's CPU, then it already + * passed the end of the mask, and has started from the beginning. + * We end if the next CPU is greater or equal to rq's CPU. + */ + if (prev_cpu < rq->cpu) { + if (cpu >= rq->cpu) + return nr_cpu_ids; + + } else if (cpu >= nr_cpu_ids) { + /* + * We passed the end of the mask, start at the beginning. + * If the result is greater or equal to the rq's CPU, then + * the loop is finished. + */ + cpu = cpumask_first(rq->rd->rto_mask); + if (cpu >= rq->cpu) + return nr_cpu_ids; + } + rq->rt.push_cpu = cpu; + + /* Return cpu to let the caller know if the loop is finished or not */ + return cpu; +} + +static int find_next_push_cpu(struct rq *rq) +{ + struct rq *next_rq; + int cpu; + + while (1) { + cpu = rto_next_cpu(rq); + if (cpu >= nr_cpu_ids) + break; + next_rq = cpu_rq(cpu); + + /* Make sure the next rq can push to this rq */ + if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr) + break; + } + + return cpu; +} + +#define RT_PUSH_IPI_EXECUTING 1 +#define RT_PUSH_IPI_RESTART 2 + +static void tell_cpu_to_push(struct rq *rq) +{ + int cpu; + + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + raw_spin_lock(&rq->rt.push_lock); + /* Make sure it's still executing */ + if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) { + /* + * Tell the IPI to restart the loop as things have + * changed since it started. + */ + rq->rt.push_flags |= RT_PUSH_IPI_RESTART; + raw_spin_unlock(&rq->rt.push_lock); + return; + } + raw_spin_unlock(&rq->rt.push_lock); + } + + /* When here, there's no IPI going around */ + + rq->rt.push_cpu = rq->cpu; + cpu = find_next_push_cpu(rq); + if (cpu >= nr_cpu_ids) + return; + + rq->rt.push_flags = RT_PUSH_IPI_EXECUTING; + + irq_work_queue_on(&rq->rt.push_work, cpu); +} + +/* Called from hardirq context */ +static void try_to_push_tasks(void *arg) +{ + struct rt_rq *rt_rq = arg; + struct rq *rq, *src_rq; + int this_cpu; + int cpu; + + this_cpu = rt_rq->push_cpu; + + /* Paranoid check */ + BUG_ON(this_cpu != smp_processor_id()); + + rq = cpu_rq(this_cpu); + src_rq = rq_of_rt_rq(rt_rq); + +again: + if (has_pushable_tasks(rq)) { + raw_spin_lock(&rq->lock); + push_rt_task(rq); + raw_spin_unlock(&rq->lock); + } + + /* Pass the IPI to the next rt overloaded queue */ + raw_spin_lock(&rt_rq->push_lock); + /* + * If the source queue changed since the IPI went out, + * we need to restart the search from that CPU again. + */ + if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) { + rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART; + rt_rq->push_cpu = src_rq->cpu; + } + + cpu = find_next_push_cpu(src_rq); + + if (cpu >= nr_cpu_ids) + rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING; + raw_spin_unlock(&rt_rq->push_lock); + + if (cpu >= nr_cpu_ids) + return; + + /* + * It is possible that a restart caused this CPU to be + * chosen again. Don't bother with an IPI, just see if we + * have more to push. 
+ */ + if (unlikely(cpu == rq->cpu)) + goto again; + + /* Try the next RT overloaded CPU */ + irq_work_queue_on(&rt_rq->push_work, cpu); +} + +static void push_irq_work_func(struct irq_work *work) +{ + struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work); + + try_to_push_tasks(rt_rq); +} +#endif /* HAVE_RT_PUSH_IPI */ + static int pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, ret = 0, cpu; @@ -1793,6 +1963,13 @@ static int pull_rt_task(struct rq *this_rq) */ smp_rmb(); +#ifdef HAVE_RT_PUSH_IPI + if (sched_feat(RT_PUSH_IPI)) { + tell_cpu_to_push(this_rq); + return 0; + } +#endif + for_each_cpu(cpu, this_rq->rd->rto_mask) { if (this_cpu == cpu) continue; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dc0f435a2779..c2c0d7bd5027 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -418,6 +419,11 @@ static inline int rt_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* RT IPI pull logic requires IRQ_WORK */ +#ifdef CONFIG_IRQ_WORK +# define HAVE_RT_PUSH_IPI +#endif + /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; @@ -435,7 +441,13 @@ struct rt_rq { unsigned long rt_nr_total; int overloaded; struct plist_head pushable_tasks; +#ifdef HAVE_RT_PUSH_IPI + int push_flags; + int push_cpu; + struct irq_work push_work; + raw_spinlock_t push_lock; #endif +#endif /* CONFIG_SMP */ int rt_queued; int rt_throttled; -- cgit v1.3-8-gc7d7 From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2015 22:10:19 +0100 Subject: perf: Remove type specific target pointers The only reason CQM had to use a hard-coded pmu type was so it could use cqm_target in hw_perf_event. Do away with the {tp,bp,cqm}_target pointers and provide a non type specific one. This allows us to do away with that silly pmu type as well. Signed-off-by: Peter Zijlstra (Intel) Cc: Vince Weaver Cc: acme@kernel.org Cc: acme@redhat.com Cc: hpa@zytor.com Cc: jolsa@redhat.com Cc: kanaka.d.juvva@intel.com Cc: matt.fleming@intel.com Cc: tglx@linutronix.de Cc: torvalds@linux-foundation.org Cc: vikas.shivappa@linux.intel.com Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/arm/kernel/hw_breakpoint.c | 2 +- arch/arm64/kernel/hw_breakpoint.c | 2 +- arch/x86/kernel/cpu/perf_event_intel_cqm.c | 7 +++---- include/linux/perf_event.h | 4 +--- include/uapi/linux/perf_event.h | 1 - kernel/events/core.c | 14 ++++---------- kernel/events/hw_breakpoint.c | 8 ++++---- kernel/trace/trace_uprobe.c | 10 +++++----- 8 files changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index 7fc70ae21185..dc7d0a95bd36 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -648,7 +648,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Per-cpu breakpoints are not supported by our stepping * mechanism. */ - if (!bp->hw.bp_target) + if (!bp->hw.target) return -EINVAL; /* diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index df1cf15377b4..d062f35911c2 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -527,7 +527,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) * Disallow per-task kernel breakpoints since these would * complicate the stepping code. 
*/ - if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target) + if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target) return -EINVAL; return 0; diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index 9a8ef8376fcd..e4d1b8b738fa 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -263,7 +263,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) /* * Events that target same task are placed into the same cache group. */ - if (a->hw.cqm_target == b->hw.cqm_target) + if (a->hw.target == b->hw.target) return true; /* @@ -279,7 +279,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b) static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event) { if (event->attach_state & PERF_ATTACH_TASK) - return perf_cgroup_from_task(event->hw.cqm_target); + return perf_cgroup_from_task(event->hw.target); return event->cgrp; } @@ -1365,8 +1365,7 @@ static int __init intel_cqm_init(void) __perf_cpu_notifier(intel_cqm_cpu_notifier); - ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", - PERF_TYPE_INTEL_CQM); + ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1); if (ret) pr_err("Intel CQM perf registration failed: %d\n", ret); else diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dac4c2831d82..5aa49d7bfd07 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -119,7 +119,6 @@ struct hw_perf_event { struct hrtimer hrtimer; }; struct { /* tracepoint */ - struct task_struct *tp_target; /* for tp_event->class */ struct list_head tp_list; }; @@ -129,7 +128,6 @@ struct hw_perf_event { struct list_head cqm_events_entry; struct list_head cqm_groups_entry; struct list_head cqm_group_entry; - struct task_struct *cqm_target; }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ @@ -138,12 +136,12 @@ struct hw_perf_event { * problem hw_breakpoint has with context * creation and event initalization. */ - struct task_struct *bp_target; struct arch_hw_breakpoint info; struct list_head bp_list; }; #endif }; + struct task_struct *target; int state; local64_t prev_count; u64 sample_period; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3c8b45de57ec..1e3cd07cf76e 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -32,7 +32,6 @@ enum perf_type_id { PERF_TYPE_HW_CACHE = 3, PERF_TYPE_RAW = 4, PERF_TYPE_BREAKPOINT = 5, - PERF_TYPE_INTEL_CQM = 6, PERF_TYPE_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 71109a045450..525062b6fba1 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7171,18 +7171,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, if (task) { event->attach_state = PERF_ATTACH_TASK; - - if (attr->type == PERF_TYPE_TRACEPOINT) - event->hw.tp_target = task; -#ifdef CONFIG_HAVE_HW_BREAKPOINT /* - * hw_breakpoint is a bit difficult here.. + * XXX pmu::event_init needs to know what task to account to + * and we cannot use the ctx information because we need the + * pmu before we get a ctx. 
*/ - else if (attr->type == PERF_TYPE_BREAKPOINT) - event->hw.bp_target = task; -#endif - else if (attr->type == PERF_TYPE_INTEL_CQM) - event->hw.cqm_target = task; + event->hw.target = task; } if (!overflow_handler && parent_event) { diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 9803a6600d49..92ce5f4ccc26 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type) */ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) { - struct task_struct *tsk = bp->hw.bp_target; + struct task_struct *tsk = bp->hw.target; struct perf_event *iter; int count = 0; list_for_each_entry(iter, &bp_task_head, hw.bp_list) { - if (iter->hw.bp_target == tsk && + if (iter->hw.target == tsk && find_slot_idx(iter) == type && (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); @@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, int nr; nr = info->cpu_pinned; - if (!bp->hw.bp_target) + if (!bp->hw.target) nr += max_task_bp_pinned(cpu, type); else nr += task_bp_pinned(cpu, bp, type); @@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, weight = -weight; /* Pinned counter cpu profiling */ - if (!bp->hw.bp_target) { + if (!bp->hw.target) { get_bp_info(bp->cpu, type)->cpu_pinned += weight; return; } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index b11441321e7a..93fdc7791eaa 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -1005,7 +1005,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) return true; list_for_each_entry(event, &filter->perf_events, hw.tp_list) { - if (event->hw.tp_target->mm == mm) + if (event->hw.target->mm == mm) return true; } @@ -1015,7 +1015,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm) static inline bool uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event) { - return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm); + return __uprobe_perf_filter(&tu->filter, event->hw.target->mm); } static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) @@ -1023,10 +1023,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event) bool done; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { list_del(&event->hw.tp_list); done = tu->filter.nr_systemwide || - (event->hw.tp_target->flags & PF_EXITING) || + (event->hw.target->flags & PF_EXITING) || uprobe_filter_event(tu, event); } else { tu->filter.nr_systemwide--; @@ -1046,7 +1046,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event) int err; write_lock(&tu->filter.rwlock); - if (event->hw.tp_target) { + if (event->hw.target) { /* * event->parent != NULL means copy_process(), we can avoid * uprobe_apply(). current->mm must be probed and we can rely -- cgit v1.3-8-gc7d7 From 0a4e6be9ca17c54817cf814b4b5aa60478c6df27 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 23 Mar 2015 20:21:51 -0300 Subject: x86: kvm: Revert "remove sched notifier for cross-cpu migrations" The following point: 2. per-CPU pvclock time info is updated if the underlying CPU changes. Is not true anymore since "KVM: x86: update pvclock area conditionally, on cpu migration". Add task migration notification back. Problem noticed by Andy Lutomirski. 
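As a rough usage sketch, assuming only the notifier interface and the task_migration_notifier fields visible in the diff below (the my_migrate_* names are invented, not part of the patch), a consumer registers an atomic notifier and reacts each time a task changes CPU:

/*
 * Illustrative sketch only -- not part of the patch below.  The names
 * my_migrate_cb/my_migrate_nb/my_migrate_init are made up; the notifier
 * interface and struct fields are the ones re-added by this patch.
 */
#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/printk.h>
#include <linux/sched.h>

static int my_migrate_cb(struct notifier_block *nb, unsigned long action, void *v)
{
	struct task_migration_notifier *mn = v;

	/* e.g. bump a per-vCPU counter so a vDSO read loop can detect the move */
	pr_debug("task %d migrated: cpu %d -> cpu %d\n",
		 mn->task->pid, mn->from_cpu, mn->to_cpu);
	return NOTIFY_DONE;
}

static struct notifier_block my_migrate_nb = {
	.notifier_call = my_migrate_cb,
};

static int __init my_migrate_init(void)
{
	register_task_migration_notifier(&my_migrate_nb);
	return 0;
}
early_initcall(my_migrate_init);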
Signed-off-by: Marcelo Tosatti CC: stable@kernel.org # 3.11+ --- arch/x86/include/asm/pvclock.h | 1 + arch/x86/kernel/pvclock.c | 44 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/vdso/vclock_gettime.c | 16 +++++++-------- include/linux/sched.h | 8 ++++++++ kernel/sched/core.c | 15 ++++++++++++++ 5 files changed, 76 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index d6b078e9fa28..25b1cc07d496 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, struct pvclock_vsyscall_time_info { struct pvclock_vcpu_time_info pvti; + u32 migrate_count; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d229a58..e5ecd20e72dd 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } +static struct pvclock_vsyscall_time_info *pvclock_vdso_info; + +static struct pvclock_vsyscall_time_info * +pvclock_get_vsyscall_user_time_info(int cpu) +{ + if (!pvclock_vdso_info) { + BUG(); + return NULL; + } + + return &pvclock_vdso_info[cpu]; +} + +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) +{ + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; +} + #ifdef CONFIG_X86_64 +static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, + void *v) +{ + struct task_migration_notifier *mn = v; + struct pvclock_vsyscall_time_info *pvti; + + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); + + /* this is NULL when pvclock vsyscall is not initialized */ + if (unlikely(pvti == NULL)) + return NOTIFY_DONE; + + pvti->migrate_count++; + + return NOTIFY_DONE; +} + +static struct notifier_block pvclock_migrate = { + .notifier_call = pvclock_task_migrate, +}; + /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + pvclock_vdso_info = i; + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } + + register_task_migration_notifier(&pvclock_migrate); + return 0; } #endif diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 9793322751e0..30933760ee5f 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode) cycle_t ret; u64 last; u32 version; + u32 migrate_count; u8 flags; unsigned cpu, cpu1; /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. - * + * When looping to get a consistent (time-info, tsc) pair, we + * also need to deal with the possibility we can switch vcpus, + * so make sure we always re-fetch time-info for the current vcpu. 
*/ do { cpu = __getcpu() & VGETCPU_CPU_MASK; @@ -104,6 +101,8 @@ static notrace cycle_t vread_pvclock(int *mode) pvti = get_pvti(cpu); + migrate_count = pvti->migrate_count; + version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); /* @@ -115,7 +114,8 @@ static notrace cycle_t vread_pvclock(int *mode) cpu1 = __getcpu() & VGETCPU_CPU_MASK; } while (unlikely(cpu != cpu1 || (pvti->pvti.version & 1) || - pvti->pvti.version != version)); + pvti->pvti.version != version || + pvti->migrate_count != migrate_count)); if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) *mode = VCLOCK_NONE; diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..be98910cc1e2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -176,6 +176,14 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); extern void update_cpu_load_nohz(void); +/* Notifier for when a task gets migrated to a new CPU */ +struct task_migration_notifier { + struct task_struct *task; + int from_cpu; + int to_cpu; +}; +extern void register_task_migration_notifier(struct notifier_block *n); + extern unsigned long get_parent_ip(unsigned long addr); extern void dump_cpu_task(int cpu); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f0f831e8a345..d0c4209bb836 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -996,6 +996,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) rq_clock_skip_update(rq, true); } +static ATOMIC_NOTIFIER_HEAD(task_migration_notifier); + +void register_task_migration_notifier(struct notifier_block *n) +{ + atomic_notifier_chain_register(&task_migration_notifier, n); +} + #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int new_cpu) { @@ -1026,10 +1033,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) trace_sched_migrate_task(p, new_cpu); if (task_cpu(p) != new_cpu) { + struct task_migration_notifier tmn; + if (p->sched_class->migrate_task_rq) p->sched_class->migrate_task_rq(p, new_cpu); p->se.nr_migrations++; perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0); + + tmn.task = p; + tmn.from_cpu = task_cpu(p); + tmn.to_cpu = new_cpu; + + atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn); } __set_task_cpu(p, new_cpu); -- cgit v1.3-8-gc7d7 From e6beaa363d56d7fc2f8cd6f7291e4d93911a428a Mon Sep 17 00:00:00 2001 From: "Tom(JeHyeon) Yeon" Date: Wed, 18 Mar 2015 14:03:30 +0900 Subject: locking/rtmutex: Rename argument in the rt_mutex_adjust_prio_chain() documentation as well The following commit changed "deadlock_detect" to "chwalk": 8930ed80f970 ("rtmutex: Cleanup deadlock detector debug logic") do that rename in the function's documentation as well. Signed-off-by: Tom(JeHyeon) Yeon Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/1426655010-31651-1-git-send-email-tom.yeon@windriver.com Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index e16e5542bf13..c0b8e9db6b2e 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -349,7 +349,7 @@ static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) * * @task: the task owning the mutex (owner) for which a chain walk is * probably needed - * @deadlock_detect: do we have to carry out deadlock detection? + * @chwalk: do we have to carry out deadlock detection? 
* @orig_lock: the mutex (can be NULL if we are walking the chain to recheck * things for a task that has just got its priority adjusted, and * is waiting on a mutex) -- cgit v1.3-8-gc7d7 From 80a9b64e2c156b6523e7a01f2ba6e5d86e722814 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 17 Mar 2015 10:40:38 -0400 Subject: ring-buffer: Replace this_cpu_*() with __this_cpu_*() It has come to my attention that this_cpu_read/write are horrible on architectures other than x86. Worse yet, they actually disable preemption or interrupts! This caused some unexpected tracing results on ARM. 101.356868: preempt_count_add <-ring_buffer_lock_reserve 101.356870: preempt_count_sub <-ring_buffer_lock_reserve The ring_buffer_lock_reserve has recursion protection that requires accessing a per cpu variable. But since preempt_disable() is traced, it too got traced while accessing the variable that is suppose to prevent recursion like this. The generic version of this_cpu_read() and write() are: #define this_cpu_generic_read(pcp) \ ({ typeof(pcp) ret__; \ preempt_disable(); \ ret__ = *this_cpu_ptr(&(pcp)); \ preempt_enable(); \ ret__; \ }) #define this_cpu_generic_to_op(pcp, val, op) \ do { \ unsigned long flags; \ raw_local_irq_save(flags); \ *__this_cpu_ptr(&(pcp)) op val; \ raw_local_irq_restore(flags); \ } while (0) Which is unacceptable for locations that know they are within preempt disabled or interrupt disabled locations. Paul McKenney stated that __this_cpu_() versions produce much better code on other architectures than this_cpu_() does, if we know that the call is done in a preempt disabled location. I also changed the recursive_unlock() to use two local variables instead of accessing the per_cpu variable twice. Link: http://lkml.kernel.org/r/20150317114411.GE3589@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20150317104038.312e73d1@gandalf.local.home Cc: stable@vger.kernel.org Acked-by: Christoph Lameter Reported-by: Uwe Kleine-Koenig Tested-by: Uwe Kleine-Koenig Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 5040d44fe5a3..922048a0f7ea 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2679,7 +2679,7 @@ static DEFINE_PER_CPU(unsigned int, current_context); static __always_inline int trace_recursive_lock(void) { - unsigned int val = this_cpu_read(current_context); + unsigned int val = __this_cpu_read(current_context); int bit; if (in_interrupt()) { @@ -2696,18 +2696,17 @@ static __always_inline int trace_recursive_lock(void) return 1; val |= (1 << bit); - this_cpu_write(current_context, val); + __this_cpu_write(current_context, val); return 0; } static __always_inline void trace_recursive_unlock(void) { - unsigned int val = this_cpu_read(current_context); + unsigned int val = __this_cpu_read(current_context); - val--; - val &= this_cpu_read(current_context); - this_cpu_write(current_context, val); + val &= val & (val - 1); + __this_cpu_write(current_context, val); } #else -- cgit v1.3-8-gc7d7 From bbedb179944c29e5e449603163eec9951116fe39 Mon Sep 17 00:00:00 2001 From: Scott Wood Date: Wed, 11 Mar 2015 22:13:57 -0500 Subject: tracing: %pF is only for function pointers Use %pS for actual addresses, otherwise you'll get bad output on arches like ppc64 where %pF expects a function descriptor. 
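For context, a small hedged example of the distinction (the helper functions are invented, not from the patch): %pS symbolizes a plain saved instruction address, while %pF expects an actual function pointer and may dereference a function descriptor on architectures such as ppc64:

/* Illustrative only; show_caller()/show_handler() are made-up helpers. */
#include <linux/kernel.h>
#include <linux/printk.h>

static void show_caller(unsigned long ip)
{
	/* ip is a raw text address (e.g. _RET_IP_), so %pS is correct here */
	pr_info("called from %pS\n", (void *)ip);
}

static void show_handler(void (*handler)(void))
{
	/* handler is a real function pointer, which is what %pF is for */
	pr_info("handler: %pF\n", handler);
}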
Link: http://lkml.kernel.org/r/1426130037-17956-22-git-send-email-scottwood@freescale.com Signed-off-by: Scott Wood Signed-off-by: Steven Rostedt --- include/trace/events/btrfs.h | 4 ++-- include/trace/events/ext3.h | 2 +- include/trace/events/ext4.h | 6 +++--- include/trace/events/module.h | 4 ++-- include/trace/events/random.h | 10 +++++----- kernel/trace/trace_entries.h | 6 +++--- tools/lib/traceevent/event-parse.c | 2 +- 7 files changed, 17 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 1faecea101f3..572e6503394a 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -962,7 +962,7 @@ TRACE_EVENT(alloc_extent_state, __entry->ip = IP ), - TP_printk("state=%p; mask = %s; caller = %pF", __entry->state, + TP_printk("state=%p; mask = %s; caller = %pS", __entry->state, show_gfp_flags(__entry->mask), (void *)__entry->ip) ); @@ -982,7 +982,7 @@ TRACE_EVENT(free_extent_state, __entry->ip = IP ), - TP_printk(" state=%p; caller = %pF", __entry->state, + TP_printk(" state=%p; caller = %pS", __entry->state, (void *)__entry->ip) ); diff --git a/include/trace/events/ext3.h b/include/trace/events/ext3.h index 6797b9de90ed..7f20707849bb 100644 --- a/include/trace/events/ext3.h +++ b/include/trace/events/ext3.h @@ -144,7 +144,7 @@ TRACE_EVENT(ext3_mark_inode_dirty, __entry->ip = IP; ), - TP_printk("dev %d,%d ino %lu caller %pF", + TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 6e5abd6d38a2..47fca36ee426 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -240,7 +240,7 @@ TRACE_EVENT(ext4_mark_inode_dirty, __entry->ip = IP; ), - TP_printk("dev %d,%d ino %lu caller %pF", + TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); @@ -1762,7 +1762,7 @@ TRACE_EVENT(ext4_journal_start, __entry->rsv_blocks = rsv_blocks; ), - TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pF", + TP_printk("dev %d,%d blocks, %d rsv_blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, (void *)__entry->ip) ); @@ -1784,7 +1784,7 @@ TRACE_EVENT(ext4_journal_start_reserved, __entry->blocks = blocks; ), - TP_printk("dev %d,%d blocks, %d caller %pF", + TP_printk("dev %d,%d blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, (void *)__entry->ip) ); diff --git a/include/trace/events/module.h b/include/trace/events/module.h index 81c4c183d348..28c45997e451 100644 --- a/include/trace/events/module.h +++ b/include/trace/events/module.h @@ -84,7 +84,7 @@ DECLARE_EVENT_CLASS(module_refcnt, __assign_str(name, mod->name); ), - TP_printk("%s call_site=%pf refcnt=%d", + TP_printk("%s call_site=%ps refcnt=%d", __get_str(name), (void *)__entry->ip, __entry->refcnt) ); @@ -121,7 +121,7 @@ TRACE_EVENT(module_request, __assign_str(name, name); ), - TP_printk("%s wait=%d call_site=%pf", + TP_printk("%s wait=%d call_site=%ps", __get_str(name), (int)__entry->wait, (void *)__entry->ip) ); diff --git a/include/trace/events/random.h b/include/trace/events/random.h index 805af6db41cc..4684de344c5d 100644 --- a/include/trace/events/random.h +++ b/include/trace/events/random.h @@ -22,7 +22,7 @@ TRACE_EVENT(add_device_randomness, __entry->IP = IP; ), - TP_printk("bytes %d caller %pF", + 
TP_printk("bytes %d caller %pS", __entry->bytes, (void *)__entry->IP) ); @@ -43,7 +43,7 @@ DECLARE_EVENT_CLASS(random__mix_pool_bytes, __entry->IP = IP; ), - TP_printk("%s pool: bytes %d caller %pF", + TP_printk("%s pool: bytes %d caller %pS", __entry->pool_name, __entry->bytes, (void *)__entry->IP) ); @@ -82,7 +82,7 @@ TRACE_EVENT(credit_entropy_bits, ), TP_printk("%s pool: bits %d entropy_count %d entropy_total %d " - "caller %pF", __entry->pool_name, __entry->bits, + "caller %pS", __entry->pool_name, __entry->bits, __entry->entropy_count, __entry->entropy_total, (void *)__entry->IP) ); @@ -207,7 +207,7 @@ DECLARE_EVENT_CLASS(random__get_random_bytes, __entry->IP = IP; ), - TP_printk("nbytes %d caller %pF", __entry->nbytes, (void *)__entry->IP) + TP_printk("nbytes %d caller %pS", __entry->nbytes, (void *)__entry->IP) ); DEFINE_EVENT(random__get_random_bytes, get_random_bytes, @@ -242,7 +242,7 @@ DECLARE_EVENT_CLASS(random__extract_entropy, __entry->IP = IP; ), - TP_printk("%s pool: nbytes %d entropy_count %d caller %pF", + TP_printk("%s pool: nbytes %d entropy_count %d caller %pS", __entry->pool_name, __entry->nbytes, __entry->entropy_count, (void *)__entry->IP) ); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index e2d027ac66a2..ee7b94a4810a 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,7 +223,7 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->fmt), FILTER_OTHER @@ -238,7 +238,7 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->buf), FILTER_OTHER @@ -253,7 +253,7 @@ FTRACE_ENTRY(bputs, bputs_entry, __field( const char *, str ) ), - F_printk("%pf: %s", + F_printk("%ps: %s", (void *)__entry->ip, __entry->str), FILTER_OTHER diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index afe20ed9fac8..2c0bd8f2aad0 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3976,7 +3976,7 @@ static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struc if (asprintf(&arg->atom.atom, "%lld", ip) < 0) goto out_free; - /* skip the first "%pf: " */ + /* skip the first "%ps: " */ for (ptr = fmt + 5, bptr = data + field->offset; bptr < data + size && *ptr; ptr++) { int ls = 0; -- cgit v1.3-8-gc7d7 From 754cb0071a5c9576ccfa6523969ef6a2f6a71676 Mon Sep 17 00:00:00 2001 From: He Kuang Date: Tue, 3 Mar 2015 15:21:33 +0800 Subject: tracing: remove ftrace:function TRACE_EVENT_FL_USE_CALL_FILTER flag TRACE_EVENT_FL_USE_CALL_FILTER flag in ftrace:functon event can be removed. This flag was first introduced in commit f306cc82a93d ("tracing: Update event filters for multibuffer"). Now, the only place uses this flag is ftrace:function, but the filter of ftrace:function has a different code path with events/syscalls and events/tracepoints. It uses ftrace_filter_write() and perf's ftrace_profile_set_filter() to set the filter, the functionality of file 'tracing/events/ftrace/function/filter' is bypassed in function init_pred(), in which case, neither call->filter nor file->filter is used. So we can safely remove TRACE_EVENT_FL_USE_CALL_FILTER flag from ftrace:function events. 
Link: http://lkml.kernel.org/r/1425367294-27852-1-git-send-email-hekuang@huawei.com Signed-off-by: He Kuang Signed-off-by: Steven Rostedt --- kernel/trace/trace_export.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 12e2b99be862..174a6a71146c 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -177,7 +177,7 @@ struct ftrace_event_call __used event_##call = { \ }, \ .event.type = etype, \ .print_fmt = print, \ - .flags = TRACE_EVENT_FL_IGNORE_ENABLE | TRACE_EVENT_FL_USE_CALL_FILTER, \ + .flags = TRACE_EVENT_FL_IGNORE_ENABLE, \ }; \ struct ftrace_event_call __used \ __attribute__((section("_ftrace_events"))) *__event_##call = &event_##call; -- cgit v1.3-8-gc7d7 From d9a16d3ab8770357015c85a07387f1d2676a4773 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 12 Mar 2015 16:58:34 +1100 Subject: trace: Don't use __weak in header files The commit that added a check for this to checkpatch says: "Using weak declarations can have unintended link defects. The __weak on the declaration causes non-weak definitions to become weak." In this case, when a PowerPC kernel is built with CONFIG_KPROBE_EVENT but not CONFIG_UPROBE_EVENT, it generates the following warning: WARNING: 1 bad relocations c0000000014f2190 R_PPC64_ADDR64 uprobes_fetch_type_table This is fixed by passing the fetch_table arrays to traceprobe_parse_probe_arg() which also means that they can never be NULL. Link: http://lkml.kernel.org/r/20150312165834.4482cb48@canb.auug.org.au Acked-by: Masami Hiramatsu Signed-off-by: Stephen Rothwell Signed-off-by: Steven Rostedt --- kernel/trace/trace_kprobe.c | 5 +++-- kernel/trace/trace_probe.c | 19 +++++++------------ kernel/trace/trace_probe.h | 10 ++-------- kernel/trace/trace_uprobe.c | 5 +++-- 4 files changed, 15 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..f34c3ad1b5f4 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -250,7 +250,7 @@ DEFINE_FETCH_symbol(string_size) #define fetch_file_offset_string_size NULL /* Fetch type information table */ -const struct fetch_type kprobes_fetch_type_table[] = { +static const struct fetch_type kprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -760,7 +760,8 @@ static int create_trace_kprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tk->tp.size, parg, - is_return, true); + is_return, true, + kprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. (%d)\n", i, ret); goto error; diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index b983b2fd2ca1..1769a81da8a7 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -356,17 +356,14 @@ static int parse_probe_vars(char *arg, const struct fetch_type *t, /* Recursive argument parser */ static int parse_probe_arg(char *arg, const struct fetch_type *t, - struct fetch_param *f, bool is_return, bool is_kprobe) + struct fetch_param *f, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { - const struct fetch_type *ftbl; unsigned long param; long offset; char *tmp; int ret = 0; - ftbl = is_kprobe ? 
kprobes_fetch_type_table : uprobes_fetch_type_table; - BUG_ON(ftbl == NULL); - switch (arg[0]) { case '$': ret = parse_probe_vars(arg + 1, t, f, is_return, is_kprobe); @@ -447,7 +444,7 @@ static int parse_probe_arg(char *arg, const struct fetch_type *t, dprm->fetch_size = get_fetch_size_function(t, dprm->fetch, ftbl); ret = parse_probe_arg(arg, t2, &dprm->orig, is_return, - is_kprobe); + is_kprobe, ftbl); if (ret) kfree(dprm); else { @@ -505,15 +502,12 @@ static int __parse_bitfield_probe_arg(const char *bf, /* String length checking wrapper */ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe) + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl) { - const struct fetch_type *ftbl; const char *t; int ret; - ftbl = is_kprobe ? kprobes_fetch_type_table : uprobes_fetch_type_table; - BUG_ON(ftbl == NULL); - if (strlen(arg) > MAX_ARGSTR_LEN) { pr_info("Argument is too long.: %s\n", arg); return -ENOSPC; @@ -535,7 +529,8 @@ int traceprobe_parse_probe_arg(char *arg, ssize_t *size, } parg->offset = *size; *size += parg->type->size; - ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, is_kprobe); + ret = parse_probe_arg(arg, parg->type, &parg->fetch, is_return, + is_kprobe, ftbl); if (ret >= 0 && t != NULL) ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch); diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h index 4f815fbce16d..e30f6cce4af6 100644 --- a/kernel/trace/trace_probe.h +++ b/kernel/trace/trace_probe.h @@ -229,13 +229,6 @@ ASSIGN_FETCH_FUNC(file_offset, ftype), \ #define FETCH_TYPE_STRING 0 #define FETCH_TYPE_STRSIZE 1 -/* - * Fetch type information table. - * It's declared as a weak symbol due to conditional compilation. - */ -extern __weak const struct fetch_type kprobes_fetch_type_table[]; -extern __weak const struct fetch_type uprobes_fetch_type_table[]; - #ifdef CONFIG_KPROBE_EVENT struct symbol_cache; unsigned long update_symbol_cache(struct symbol_cache *sc); @@ -333,7 +326,8 @@ find_event_file_link(struct trace_probe *tp, struct ftrace_event_file *file) } extern int traceprobe_parse_probe_arg(char *arg, ssize_t *size, - struct probe_arg *parg, bool is_return, bool is_kprobe); + struct probe_arg *parg, bool is_return, bool is_kprobe, + const struct fetch_type *ftbl); extern int traceprobe_conflict_field_name(const char *name, struct probe_arg *args, int narg); diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 7dc1c8abecd6..74865465e0b7 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -196,7 +196,7 @@ DEFINE_FETCH_file_offset(string) DEFINE_FETCH_file_offset(string_size) /* Fetch type information table */ -const struct fetch_type uprobes_fetch_type_table[] = { +static const struct fetch_type uprobes_fetch_type_table[] = { /* Special types */ [FETCH_TYPE_STRING] = __ASSIGN_FETCH_TYPE("string", string, string, sizeof(u32), 1, "__data_loc char[]"), @@ -535,7 +535,8 @@ static int create_trace_uprobe(int argc, char **argv) /* Parse fetch argument */ ret = traceprobe_parse_probe_arg(arg, &tu->tp.size, parg, - is_return, false); + is_return, false, + uprobes_fetch_type_table); if (ret) { pr_info("Parse error at argument[%d]. 
(%d)\n", i, ret); goto error; -- cgit v1.3-8-gc7d7 From 074c238177a75f5e79af3b2cb6a84e54823ef950 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 25 Mar 2015 15:55:42 -0700 Subject: mm: numa: slow PTE scan rate if migration failures occur Dave Chinner reported the following on https://lkml.org/lkml/2015/3/1/226 Across the board the 4.0-rc1 numbers are much slower, and the degradation is far worse when using the large memory footprint configs. Perf points straight at the cause - this is from 4.0-rc1 on the "-o bhash=101073" config: - 56.07% 56.07% [kernel] [k] default_send_IPI_mask_sequence_phys - default_send_IPI_mask_sequence_phys - 99.99% physflat_send_IPI_mask - 99.37% native_send_call_func_ipi smp_call_function_many - native_flush_tlb_others - 99.85% flush_tlb_page ptep_clear_flush try_to_unmap_one rmap_walk try_to_unmap migrate_pages migrate_misplaced_page - handle_mm_fault - 99.73% __do_page_fault trace_do_page_fault do_async_page_fault + async_page_fault 0.63% native_send_call_func_single_ipi generic_exec_single smp_call_function_single This is showing excessive migration activity even though excessive migrations are meant to get throttled. Normally, the scan rate is tuned on a per-task basis depending on the locality of faults. However, if migrations fail for any reason then the PTE scanner may scan faster if the faults continue to be remote. This means there is higher system CPU overhead and fault trapping at exactly the time we know that migrations cannot happen. This patch tracks when migration failures occur and slows the PTE scanner. Signed-off-by: Mel Gorman Reported-by: Dave Chinner Tested-by: Dave Chinner Cc: Ingo Molnar Cc: Aneesh Kumar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 9 +++++---- kernel/sched/fair.c | 8 ++++++-- mm/huge_memory.c | 3 ++- mm/memory.c | 3 ++- 4 files changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..a419b65770d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1625,11 +1625,11 @@ struct task_struct { /* * numa_faults_locality tracks if faults recorded during the last - * scan window were remote/local. The task scan period is adapted - * based on the locality of the faults with different weights - * depending on whether they were shared or private faults + * scan window were remote/local or failed to migrate. The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults */ - unsigned long numa_faults_locality[2]; + unsigned long numa_faults_locality[3]; unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ @@ -1719,6 +1719,7 @@ struct task_struct { #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 #define TNF_FAULT_LOCAL 0x08 +#define TNF_MIGRATE_FAIL 0x10 #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7ce18f3c097a..bcfe32088b37 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1609,9 +1609,11 @@ static void update_task_scan_period(struct task_struct *p, /* * If there were no record hinting faults then either the task is * completely idle or all activity is areas that are not of interest - * to automatic numa balancing. Scan slower + * to automatic numa balancing. 
Related to that, if there were failed + * migration then it implies we are migrating too quickly or the local + * node is overloaded. In either case, scan slower */ - if (local + shared == 0) { + if (local + shared == 0 || p->numa_faults_locality[2]) { p->numa_scan_period = min(p->numa_scan_period_max, p->numa_scan_period << 1); @@ -2080,6 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (migrated) p->numa_pages_migrated += pages; + if (flags & TNF_MIGRATE_FAIL) + p->numa_faults_locality[2] += pages; p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages; p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0a42d1521aa4..51b3e7c64622 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1350,7 +1350,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; - } + } else + flags |= TNF_MIGRATE_FAIL; goto out; clear_pmdnuma: diff --git a/mm/memory.c b/mm/memory.c index d20e12da3a3c..97839f5c8c30 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3103,7 +3103,8 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, if (migrated) { page_nid = target_nid; flags |= TNF_MIGRATED; - } + } else + flags |= TNF_MIGRATE_FAIL; out: if (page_nid != -1) -- cgit v1.3-8-gc7d7 From e2e40f2c1ed433c5e224525c8c862fd32e5d3df2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 22 Feb 2015 08:58:50 -0800 Subject: fs: move struct kiocb to fs.h struct kiocb now is a generic I/O container, so move it to fs.h. Also do a #include diet for aio.h while we're at it. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- arch/s390/hypfs/inode.c | 2 +- drivers/char/mem.c | 2 +- drivers/char/tile-srom.c | 1 - drivers/infiniband/hw/ipath/ipath_file_ops.c | 1 - drivers/infiniband/hw/qib/qib_file_ops.c | 1 - drivers/misc/mei/amthif.c | 1 - drivers/misc/mei/main.c | 1 - drivers/misc/mei/pci-me.c | 1 - drivers/scsi/sg.c | 2 +- drivers/staging/unisys/include/timskmod.h | 1 - drivers/usb/gadget/function/f_fs.c | 1 + drivers/usb/gadget/legacy/inode.c | 1 + fs/9p/vfs_addr.c | 2 +- fs/affs/file.c | 2 +- fs/afs/write.c | 1 - fs/bfs/inode.c | 1 + fs/block_dev.c | 1 - fs/btrfs/file.c | 2 +- fs/btrfs/inode.c | 2 +- fs/ceph/file.c | 1 - fs/direct-io.c | 1 - fs/ecryptfs/file.c | 1 - fs/ext2/inode.c | 2 +- fs/ext3/inode.c | 2 +- fs/ext4/file.c | 2 +- fs/ext4/indirect.c | 2 +- fs/ext4/inode.c | 1 - fs/ext4/page-io.c | 1 - fs/f2fs/data.c | 2 +- fs/fat/inode.c | 1 - fs/fuse/cuse.c | 2 +- fs/fuse/dev.c | 1 - fs/fuse/file.c | 2 +- fs/gfs2/aops.c | 2 +- fs/gfs2/file.c | 1 - fs/hfs/inode.c | 2 +- fs/hfsplus/inode.c | 2 +- fs/jfs/inode.c | 2 +- fs/nfs/file.c | 1 - fs/nilfs2/inode.c | 2 +- fs/ntfs/file.c | 1 - fs/ntfs/inode.c | 1 - fs/ocfs2/aops.c | 1 + fs/ocfs2/aops.h | 2 +- fs/pipe.c | 1 - fs/read_write.c | 1 - fs/reiserfs/inode.c | 2 +- fs/splice.c | 1 - fs/ubifs/file.c | 1 - fs/udf/file.c | 2 +- fs/udf/inode.c | 2 +- fs/xfs/xfs_aops.c | 1 - fs/xfs/xfs_file.c | 1 - include/linux/aio.h | 31 +--------------------------- include/linux/fs.h | 22 ++++++++++++++++++++ include/net/sock.h | 1 - kernel/printk/printk.c | 2 +- kernel/sysctl.c | 1 + mm/filemap.c | 1 - mm/page_io.c | 2 +- mm/shmem.c | 2 +- net/ipv4/raw.c | 1 - sound/core/pcm_native.c | 2 +- 63 files changed, 55 insertions(+), 86 deletions(-) (limited to 'kernel') diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 
4c8008dd938e..ad66b07f742e 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include "hypfs.h" diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 297110c12635..9c4fd7a8e2e5 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include diff --git a/drivers/char/tile-srom.c b/drivers/char/tile-srom.c index 02e76ac6d282..69f6b4acc377 100644 --- a/drivers/char/tile-srom.c +++ b/drivers/char/tile-srom.c @@ -27,7 +27,6 @@ #include /* size_t */ #include #include /* O_ACCMODE */ -#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 6d7f453b4d05..aed8afee56da 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -40,7 +40,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index b15e34eeef68..826c17ee4c7d 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/amthif.c b/drivers/misc/mei/amthif.c index c4cb9a984a5f..40ea639fa413 100644 --- a/drivers/misc/mei/amthif.c +++ b/drivers/misc/mei/amthif.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/main.c b/drivers/misc/mei/main.c index 3c019c0e60eb..47680c84801c 100644 --- a/drivers/misc/mei/main.c +++ b/drivers/misc/mei/main.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index bd3039ab8f98..af44ee26075d 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0cbc1fb45f10..c78a6f7bbc20 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -33,7 +33,6 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include #include #include -#include #include #include #include @@ -51,6 +50,7 @@ static int sg_version_num = 30536; /* 2 digits for each component */ #include #include #include +#include #include "scsi.h" #include diff --git a/drivers/staging/unisys/include/timskmod.h b/drivers/staging/unisys/include/timskmod.h index 4019a0d63645..52648d4d9922 100644 --- a/drivers/staging/unisys/include/timskmod.h +++ b/drivers/staging/unisys/include/timskmod.h @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index b64538b498dc..a12315a78248 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index a4a80694f607..662ef2c1c62b 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index eb14e055ea83..ff1a5bac4200 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -33,7 +33,7 @@ #include 
#include #include -#include +#include #include #include diff --git a/fs/affs/file.c b/fs/affs/file.c index d2468bf95669..33eaa67bb026 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -12,7 +12,7 @@ * affs regular file handling primitives */ -#include +#include #include "affs.h" static struct buffer_head *affs_get_extblock_slow(struct inode *inode, u32 ext); diff --git a/fs/afs/write.c b/fs/afs/write.c index c13cb08964ed..0714abcd7f32 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -14,7 +14,6 @@ #include #include #include -#include #include "internal.h" static int afs_write_back_from_locked_page(struct afs_writeback *wb, diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 90bc079d9982..fdcb4d69f430 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include "bfs.h" diff --git a/fs/block_dev.c b/fs/block_dev.c index 975266be67d3..2e522aed6584 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include "internal.h" diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b78bbbac900d..69c9508d2c7e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -32,6 +31,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 54bcf639d1cf..b214ab178f3a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -43,6 +42,7 @@ #include #include #include +#include #include "ctree.h" #include "disk-io.h" #include "transaction.h" diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 081c4e3f9e49..98e257c1b5b1 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -7,7 +7,6 @@ #include #include #include -#include #include #include "super.h" diff --git a/fs/direct-io.c b/fs/direct-io.c index c38b460776e6..6fb00e3f1059 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -37,7 +37,6 @@ #include #include #include -#include /* * How many user pages to map in one call to get_user_pages(). 
This determines diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index a36da8841e0c..273d36e3f0c0 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -31,7 +31,6 @@ #include #include #include -#include #include "ecryptfs_kernel.h" /** diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 6434bc000125..df9d6afbc5d5 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2c6ccc49ba27..db07ffbe7c85 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include "ext3.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 33a09da16c9c..598abbbe6786 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -23,9 +23,9 @@ #include #include #include -#include #include #include +#include #include "ext4.h" #include "ext4_jbd2.h" #include "xattr.h" diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 6b9878a24182..8611640856d3 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -20,9 +20,9 @@ * (sct@redhat.com), 1993, 1998 */ -#include #include "ext4_jbd2.h" #include "truncate.h" +#include #include diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 85404f15e53a..6325d2c1a65c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include "ext4_jbd2.h" diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index b24a2541a9ba..464984261e69 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 985ed023a750..497f8515d205 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -12,12 +12,12 @@ #include #include #include -#include #include #include #include #include #include +#include #include "f2fs.h" #include "node.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 497c7c5263c7..8521207de229 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 28d0c7abba1c..b3fa05032234 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -48,6 +47,7 @@ #include #include #include +#include #include "fuse_i.h" diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ed19a7d622fa..8c92c727ddd6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -19,7 +19,6 @@ #include #include #include -#include MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a5c5e38b3ff8..ff102cbf16ea 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -15,8 +15,8 @@ #include #include #include -#include #include +#include static const struct file_operations fuse_direct_io_file_operations; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 4ad4f94edebe..fe6634d25d1d 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include "gfs2.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 3e32bb8e2d7e..f6fc412b1100 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include "gfs2.h" diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index d0929bc81782..98d4ea45bb70 100644 --- a/fs/hfs/inode.c +++ 
b/fs/hfs/inode.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "hfs_fs.h" #include "btree.h" diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 0cf786f2d046..f541196d4ee9 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include "hfsplus_fs.h" #include "hfsplus_raw.h" diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index bd3df1ca3c9b..3197aed10614 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -22,8 +22,8 @@ #include #include #include +#include #include -#include #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_filsys.h" diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 94712fc781fa..5d8b89cb13c0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 8b5969538f39..ab4987bc637f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "nilfs.h" #include "btnode.h" #include "segment.h" diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 1da9b2d184dc..f16f2d8401fe 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 898b9949d363..1d0c21df0d80 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -28,7 +28,6 @@ #include #include #include -#include #include "aops.h" #include "attrib.h" diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 44db1808cdb5..e1bf18c5d25e 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -29,6 +29,7 @@ #include #include #include +#include #include diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index 6cae155d54df..dd59599b022d 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -22,7 +22,7 @@ #ifndef OCFS2_AOPS_H #define OCFS2_AOPS_H -#include +#include handle_t *ocfs2_start_walk_page_trans(struct inode *inode, struct page *page, diff --git a/fs/pipe.c b/fs/pipe.c index 21981e58e2a6..2d084f2d0b83 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/fs/read_write.c b/fs/read_write.c index 76e324e8ce8d..99a6ef946d01 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index e72401e1f995..9312b7842e03 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/splice.c b/fs/splice.c index 7968da96bebb..4bbfa95b5bfe 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -32,7 +32,6 @@ #include #include #include -#include #include "internal.h" /* diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index e627c0acf626..c3d15fe83403 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -50,7 +50,6 @@ */ #include "ubifs.h" -#include #include #include #include diff --git a/fs/udf/file.c b/fs/udf/file.c index 9c0b6da9dbb3..7f885cc8b0b7 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a445d599098d..9c1fbd23913d 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,7 +38,7 @@ #include #include #include -#include +#include #include "udf_i.h" 
#include "udf_sb.h" diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3a9b7a1b8704..4f8cdc59bc38 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -31,7 +31,6 @@ #include "xfs_bmap.h" #include "xfs_bmap_util.h" #include "xfs_bmap_btree.h" -#include #include #include #include diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 1cdba95c78cb..f527618cb42b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -37,7 +37,6 @@ #include "xfs_log.h" #include "xfs_icache.h" -#include #include #include #include diff --git a/include/linux/aio.h b/include/linux/aio.h index 5c40b61285ac..9eb42dbc5582 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -1,52 +1,23 @@ #ifndef __LINUX__AIO_H #define __LINUX__AIO_H -#include -#include #include -#include -#include - -#include struct kioctx; struct kiocb; +struct mm_struct; #define KIOCB_KEY 0 typedef int (kiocb_cancel_fn)(struct kiocb *); -#define IOCB_EVENTFD (1 << 0) - -struct kiocb { - struct file *ki_filp; - loff_t ki_pos; - void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); - void *private; - int ki_flags; -}; - -static inline bool is_sync_kiocb(struct kiocb *kiocb) -{ - return kiocb->ki_complete == NULL; -} - -static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) -{ - *kiocb = (struct kiocb) { - .ki_filp = filp, - }; -} - /* prototypes */ #ifdef CONFIG_AIO -struct mm_struct; extern void exit_aio(struct mm_struct *mm); extern long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else -struct mm_struct; static inline void exit_aio(struct mm_struct *mm) { } static inline long do_io_submit(aio_context_t ctx_id, long nr, struct iocb __user * __user *iocbpp, diff --git a/include/linux/fs.h b/include/linux/fs.h index 447932aed1e1..48c1472bde4a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -314,6 +314,28 @@ struct page; struct address_space; struct writeback_control; +#define IOCB_EVENTFD (1 << 0) + +struct kiocb { + struct file *ki_filp; + loff_t ki_pos; + void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); + void *private; + int ki_flags; +}; + +static inline bool is_sync_kiocb(struct kiocb *kiocb) +{ + return kiocb->ki_complete == NULL; +} + +static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp) +{ + *kiocb = (struct kiocb) { + .ki_filp = filp, + }; +} + /* * "descriptor" for what we're up to with a read. 
* This allows us to use the same read code yet diff --git a/include/net/sock.h b/include/net/sock.h index ab186b1d31ff..71c1300025e2 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -57,7 +57,6 @@ #include #include #include -#include #include #include diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 60b2aa2a2da9..40d50cc4c686 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -46,6 +45,7 @@ #include #include #include +#include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 88ea2d6e0031..83d907afb4a6 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -19,6 +19,7 @@ */ #include +#include #include #include #include diff --git a/mm/filemap.c b/mm/filemap.c index ad7242043bdb..876f4e6f3ed6 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/page_io.c b/mm/page_io.c index 7ef21577856c..a96c8562d835 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -20,8 +20,8 @@ #include #include #include -#include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, diff --git a/mm/shmem.c b/mm/shmem.c index a63031fa3e0c..944b94079bb0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -31,7 +31,7 @@ #include #include #include -#include +#include static struct vfsmount *shm_mnt; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index f027a708b7e0..4a356b7c081b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index b03a638b420c..9ecff240a39b 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include /* * Compatibility -- cgit v1.3-8-gc7d7 From 8710e914027e4f64058ebbf0501cc6db3cc8454f Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:22 -0700 Subject: timers, sched/clock: Match scope of read and write seqcounts Currently the scope of the raw_write_seqcount_begin/end() in sched_clock_register() far exceeds the scope of the read section in sched_clock(). This gives the impression of safety during cursory review but achieves little. Note that this is likely to be a latent issue at present because sched_clock_register() is typically called before we enable interrupts, however the issue does risk bugs being needlessly introduced as the code evolves. This patch fixes the problem by increasing the scope of the read locking performed by sched_clock() to cover all data modified by sched_clock_register. We also improve clarity by moving writes to struct clock_data that do not impact sched_clock() outside of the critical section. 
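
For illustration only (generic code, not part of this patch): the write-side critical section must bracket every field that readers sample inside their retry loop, otherwise the retry cannot detect a concurrent update. A minimal sketch of the pairing, with seqcount initialisation omitted:

#include <linux/seqlock.h>
#include <linux/types.h>

struct sample_clock {
	seqcount_t seq;
	u64 epoch_ns;
	u64 epoch_cyc;
};

/* Writer: all fields read under the seqcount are updated inside begin/end. */
static void sample_update(struct sample_clock *c, u64 ns, u64 cyc)
{
	raw_write_seqcount_begin(&c->seq);
	c->epoch_ns = ns;
	c->epoch_cyc = cyc;
	raw_write_seqcount_end(&c->seq);
}

/* Reader: retries until a consistent snapshot of both fields is observed. */
static u64 sample_read(struct sample_clock *c)
{
	unsigned long seq;
	u64 ns, cyc;

	do {
		seq = raw_read_seqcount_begin(&c->seq);
		ns = c->epoch_ns;
		cyc = c->epoch_cyc;
	} while (read_seqcount_retry(&c->seq, seq));

	return ns + cyc;	/* stand-in for the real epoch + delta math */
}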
Signed-off-by: Daniel Thompson [ Reworked it slightly to apply to tip/timers/core] Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index ca3bc5c7027c..1751e956add9 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -58,23 +58,21 @@ static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) unsigned long long notrace sched_clock(void) { - u64 epoch_ns; - u64 epoch_cyc; - u64 cyc; + u64 cyc, res; unsigned long seq; - if (cd.suspended) - return cd.epoch_ns; - do { seq = raw_read_seqcount_begin(&cd.seq); - epoch_cyc = cd.epoch_cyc; - epoch_ns = cd.epoch_ns; + + res = cd.epoch_ns; + if (!cd.suspended) { + cyc = read_sched_clock(); + cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; + res += cyc_to_ns(cyc, cd.mult, cd.shift); + } } while (read_seqcount_retry(&cd.seq, seq)); - cyc = read_sched_clock(); - cyc = (cyc - epoch_cyc) & sched_clock_mask; - return epoch_ns + cyc_to_ns(cyc, cd.mult, cd.shift); + return res; } /* @@ -111,7 +109,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; - ktime_t new_wrap_kt; unsigned long r; char r_unit; @@ -124,10 +121,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); + cd.rate = rate; /* calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); - new_wrap_kt = ns_to_ktime(wrap); + cd.wrap_kt = ns_to_ktime(wrap); /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); @@ -138,8 +136,6 @@ void __init sched_clock_register(u64 (*read)(void), int bits, raw_write_seqcount_begin(&cd.seq); read_sched_clock = read; sched_clock_mask = new_mask; - cd.rate = rate; - cd.wrap_kt = new_wrap_kt; cd.mult = new_mult; cd.shift = new_shift; cd.epoch_cyc = new_epoch; -- cgit v1.3-8-gc7d7 From cf7c9c170787d6870af54684822f58acc00a966c Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:23 -0700 Subject: timers, sched/clock: Optimize cache line usage Currently sched_clock(), a very hot code path, is not optimized to minimise its cache profile. In particular: 1. cd is not ____cacheline_aligned, 2. struct clock_data does not distinguish between hotpath and coldpath data, reducing locality of reference in the hotpath, 3. Some hotpath data is missing from struct clock_data and is marked __read_mostly (which more or less guarantees it will not share a cache line with cd). This patch corrects these problems by extracting all hotpath data into a separate structure and using ____cacheline_aligned to ensure the hotpath uses a single (64 byte) cache line. 
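
For illustration only (the structures below are invented for this note, they are not the ones added by the patch), the general shape of the hot/cold split looks like this:

#include <linux/cache.h>
#include <linux/types.h>

/* Hotpath fields only: small enough to share one 64-byte cache line. */
struct hot_read_data {
	u64 epoch_ns;
	u64 epoch_cyc;
	u32 mult;
	u32 shift;
};

/*
 * Cold (registration/update-time) fields live after the hot data, and the
 * whole object is cacheline aligned so the read path touches a single line.
 */
struct example_clock_data {
	struct hot_read_data hot;
	unsigned long rate;
} ____cacheline_aligned;

Keeping the hot structure at the front of an aligned container is what lets the common read path stay within one cache line.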
Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 112 +++++++++++++++++++++++++++++++--------------- 1 file changed, 77 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 1751e956add9..872e0685d1fb 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -18,28 +18,59 @@ #include #include -struct clock_data { - ktime_t wrap_kt; +/** + * struct clock_read_data - data required to read from sched_clock + * + * @epoch_ns: sched_clock value at last update + * @epoch_cyc: Clock cycle value at last update + * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit + * clocks + * @read_sched_clock: Current clock source (or dummy source when suspended) + * @mult: Multipler for scaled math conversion + * @shift: Shift value for scaled math conversion + * @suspended: Flag to indicate if the clock is suspended (stopped) + * + * Care must be taken when updating this structure; it is read by + * some very hot code paths. It occupies <=48 bytes and, when combined + * with the seqcount used to synchronize access, comfortably fits into + * a 64 byte cache line. + */ +struct clock_read_data { u64 epoch_ns; u64 epoch_cyc; - seqcount_t seq; - unsigned long rate; + u64 sched_clock_mask; + u64 (*read_sched_clock)(void); u32 mult; u32 shift; bool suspended; }; +/** + * struct clock_data - all data needed for sched_clock (including + * registration of a new clock source) + * + * @seq: Sequence counter for protecting updates. + * @read_data: Data required to read from sched_clock. + * @wrap_kt: Duration for which clock can run before wrapping + * @rate: Tick rate of the registered clock + * @actual_read_sched_clock: Registered clock read function + * + * The ordering of this structure has been chosen to optimize cache + * performance. In particular seq and read_data (combined) should fit + * into a single 64 byte cache line. 
+ */ +struct clock_data { + seqcount_t seq; + struct clock_read_data read_data; + ktime_t wrap_kt; + unsigned long rate; +}; + static struct hrtimer sched_clock_timer; static int irqtime = -1; core_param(irqtime, irqtime, int, 0400); -static struct clock_data cd = { - .mult = NSEC_PER_SEC / HZ, -}; - -static u64 __read_mostly sched_clock_mask; - static u64 notrace jiffy_sched_clock_read(void) { /* @@ -49,7 +80,10 @@ static u64 notrace jiffy_sched_clock_read(void) return (u64)(jiffies - INITIAL_JIFFIES); } -static u64 __read_mostly (*read_sched_clock)(void) = jiffy_sched_clock_read; +static struct clock_data cd ____cacheline_aligned = { + .read_data = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, +}; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) { @@ -60,15 +94,16 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; + struct clock_read_data *rd = &cd.read_data; do { seq = raw_read_seqcount_begin(&cd.seq); - res = cd.epoch_ns; - if (!cd.suspended) { - cyc = read_sched_clock(); - cyc = (cyc - cd.epoch_cyc) & sched_clock_mask; - res += cyc_to_ns(cyc, cd.mult, cd.shift); + res = rd->epoch_ns; + if (!rd->suspended) { + cyc = rd->read_sched_clock(); + cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; + res += cyc_to_ns(cyc, rd->mult, rd->shift); } } while (read_seqcount_retry(&cd.seq, seq)); @@ -83,16 +118,17 @@ static void notrace update_sched_clock(void) unsigned long flags; u64 cyc; u64 ns; + struct clock_read_data *rd = &cd.read_data; - cyc = read_sched_clock(); - ns = cd.epoch_ns + - cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift); raw_local_irq_save(flags); raw_write_seqcount_begin(&cd.seq); - cd.epoch_ns = ns; - cd.epoch_cyc = cyc; + rd->epoch_ns = ns; + rd->epoch_cyc = cyc; raw_write_seqcount_end(&cd.seq); raw_local_irq_restore(flags); } @@ -111,6 +147,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; + struct clock_read_data *rd = &cd.read_data; if (cd.rate > rate) return; @@ -129,17 +166,18 @@ void __init sched_clock_register(u64 (*read)(void), int bits, /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = read_sched_clock(); - ns = cd.epoch_ns + cyc_to_ns((cyc - cd.epoch_cyc) & sched_clock_mask, - cd.mult, cd.shift); + cyc = rd->read_sched_clock(); + ns = rd->epoch_ns + + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, + rd->mult, rd->shift); raw_write_seqcount_begin(&cd.seq); - read_sched_clock = read; - sched_clock_mask = new_mask; - cd.mult = new_mult; - cd.shift = new_shift; - cd.epoch_cyc = new_epoch; - cd.epoch_ns = ns; + rd->read_sched_clock = read; + rd->sched_clock_mask = new_mask; + rd->mult = new_mult; + rd->shift = new_shift; + rd->epoch_cyc = new_epoch; + rd->epoch_ns = ns; raw_write_seqcount_end(&cd.seq); r = rate; @@ -171,7 +209,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. 
*/ - if (read_sched_clock == jiffy_sched_clock_read) + if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); @@ -187,17 +225,21 @@ void __init sched_clock_postinit(void) static int sched_clock_suspend(void) { + struct clock_read_data *rd = &cd.read_data; + update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - cd.suspended = true; + rd->suspended = true; return 0; } static void sched_clock_resume(void) { - cd.epoch_cyc = read_sched_clock(); + struct clock_read_data *rd = &cd.read_data; + + rd->epoch_cyc = rd->read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - cd.suspended = false; + rd->suspended = false; } static struct syscore_ops sched_clock_ops = { -- cgit v1.3-8-gc7d7 From 13dbeb384d2d3aa555ea48d511e8cb110bd172e0 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:24 -0700 Subject: timers, sched/clock: Remove suspend from clock_read_data() Currently cd.read_data.suspended is read by the hotpath function sched_clock(). This variable need not be accessed on the hotpath. In fact, once it is removed, we can remove the conditional branches from sched_clock() and install a dummy read_sched_clock function to suspend the clock. The new master copy of the function pointer (actual_read_sched_clock) is introduced and is used for all reads of the clock hardware except those within sched_clock itself. Suggested-by: Thomas Gleixner Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-4-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 872e0685d1fb..52ea5d976393 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -28,10 +28,9 @@ * @read_sched_clock: Current clock source (or dummy source when suspended) * @mult: Multipler for scaled math conversion * @shift: Shift value for scaled math conversion - * @suspended: Flag to indicate if the clock is suspended (stopped) * * Care must be taken when updating this structure; it is read by - * some very hot code paths. It occupies <=48 bytes and, when combined + * some very hot code paths. It occupies <=40 bytes and, when combined * with the seqcount used to synchronize access, comfortably fits into * a 64 byte cache line. 
*/ @@ -42,7 +41,6 @@ struct clock_read_data { u64 (*read_sched_clock)(void); u32 mult; u32 shift; - bool suspended; }; /** @@ -64,6 +62,7 @@ struct clock_data { struct clock_read_data read_data; ktime_t wrap_kt; unsigned long rate; + u64 (*actual_read_sched_clock)(void); }; static struct hrtimer sched_clock_timer; @@ -83,6 +82,8 @@ static u64 notrace jiffy_sched_clock_read(void) static struct clock_data cd ____cacheline_aligned = { .read_data = { .mult = NSEC_PER_SEC / HZ, .read_sched_clock = jiffy_sched_clock_read, }, + .actual_read_sched_clock = jiffy_sched_clock_read, + }; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -99,12 +100,9 @@ unsigned long long notrace sched_clock(void) do { seq = raw_read_seqcount_begin(&cd.seq); - res = rd->epoch_ns; - if (!rd->suspended) { - cyc = rd->read_sched_clock(); - cyc = (cyc - rd->epoch_cyc) & rd->sched_clock_mask; - res += cyc_to_ns(cyc, rd->mult, rd->shift); - } + cyc = (rd->read_sched_clock() - rd->epoch_cyc) & + rd->sched_clock_mask; + res = rd->epoch_ns + cyc_to_ns(cyc, rd->mult, rd->shift); } while (read_seqcount_retry(&cd.seq, seq)); return res; @@ -120,7 +118,7 @@ static void notrace update_sched_clock(void) u64 ns; struct clock_read_data *rd = &cd.read_data; - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); @@ -166,10 +164,11 @@ void __init sched_clock_register(u64 (*read)(void), int bits, /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); - cyc = rd->read_sched_clock(); + cyc = cd.actual_read_sched_clock(); ns = rd->epoch_ns + cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, rd->mult, rd->shift); + cd.actual_read_sched_clock = read; raw_write_seqcount_begin(&cd.seq); rd->read_sched_clock = read; @@ -209,7 +208,7 @@ void __init sched_clock_postinit(void) * If no sched_clock function has been provided at that point, * make it the final one one. */ - if (cd.read_data.read_sched_clock == jiffy_sched_clock_read) + if (cd.actual_read_sched_clock == jiffy_sched_clock_read) sched_clock_register(jiffy_sched_clock_read, BITS_PER_LONG, HZ); update_sched_clock(); @@ -223,13 +222,24 @@ void __init sched_clock_postinit(void) hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); } +/* + * Clock read function for use when the clock is suspended. + * + * This function makes it appear to sched_clock() as if the clock + * stopped counting at its last update. 
+ */ +static u64 notrace suspended_sched_clock_read(void) +{ + return cd.read_data.epoch_cyc; +} + static int sched_clock_suspend(void) { struct clock_read_data *rd = &cd.read_data; update_sched_clock(); hrtimer_cancel(&sched_clock_timer); - rd->suspended = true; + rd->read_sched_clock = suspended_sched_clock_read; return 0; } @@ -237,9 +247,9 @@ static void sched_clock_resume(void) { struct clock_read_data *rd = &cd.read_data; - rd->epoch_cyc = rd->read_sched_clock(); + rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); - rd->suspended = false; + rd->read_sched_clock = cd.actual_read_sched_clock; } static struct syscore_ops sched_clock_ops = { -- cgit v1.3-8-gc7d7 From 9fee69a8c8070b38b558161a3f18bd5e2b664682 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:25 -0700 Subject: timers, sched/clock: Remove redundant notrace from update function Currently update_sched_clock() is marked as notrace but this function is not called by ftrace. This is trivially fixed by removing the mark up. Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-5-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 52ea5d976393..8adb9d0c969a 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -111,7 +111,7 @@ unsigned long long notrace sched_clock(void) /* * Atomically update the sched_clock epoch. */ -static void notrace update_sched_clock(void) +static void update_sched_clock(void) { unsigned long flags; u64 cyc; -- cgit v1.3-8-gc7d7 From 1809bfa44e1019e397fabaa6f2349bb7237e57a4 Mon Sep 17 00:00:00 2001 From: Daniel Thompson Date: Thu, 26 Mar 2015 12:23:26 -0700 Subject: timers, sched/clock: Avoid deadlock during read from NMI Currently it is possible for an NMI (or FIQ on ARM) to come in and read sched_clock() whilst update_sched_clock() has locked the seqcount for writing. This results in the NMI handler locking up when it calls raw_read_seqcount_begin(). This patch fixes the NMI safety issues by providing banked clock data. This is a similar approach to the one used in Thomas Gleixner's 4396e058c52e("timekeeping: Provide fast and NMI safe access to CLOCK_MONOTONIC"). Suggested-by: Stephen Boyd Signed-off-by: Daniel Thompson Signed-off-by: John Stultz Reviewed-by: Stephen Boyd Acked-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Link: http://lkml.kernel.org/r/1427397806-20889-6-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 103 ++++++++++++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 35 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index 8adb9d0c969a..eeea1e950b72 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -47,19 +47,20 @@ struct clock_read_data { * struct clock_data - all data needed for sched_clock (including * registration of a new clock source) * - * @seq: Sequence counter for protecting updates. + * @seq: Sequence counter for protecting updates. 
The lowest + * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. * @wrap_kt: Duration for which clock can run before wrapping * @rate: Tick rate of the registered clock * @actual_read_sched_clock: Registered clock read function * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data (combined) should fit + * performance. In particular seq and read_data[0] (combined) should fit * into a single 64 byte cache line. */ struct clock_data { seqcount_t seq; - struct clock_read_data read_data; + struct clock_read_data read_data[2]; ktime_t wrap_kt; unsigned long rate; u64 (*actual_read_sched_clock)(void); @@ -80,10 +81,9 @@ static u64 notrace jiffy_sched_clock_read(void) } static struct clock_data cd ____cacheline_aligned = { - .read_data = { .mult = NSEC_PER_SEC / HZ, - .read_sched_clock = jiffy_sched_clock_read, }, + .read_data[0] = { .mult = NSEC_PER_SEC / HZ, + .read_sched_clock = jiffy_sched_clock_read, }, .actual_read_sched_clock = jiffy_sched_clock_read, - }; static inline u64 notrace cyc_to_ns(u64 cyc, u32 mult, u32 shift) @@ -95,10 +95,11 @@ unsigned long long notrace sched_clock(void) { u64 cyc, res; unsigned long seq; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd; do { - seq = raw_read_seqcount_begin(&cd.seq); + seq = raw_read_seqcount(&cd.seq); + rd = cd.read_data + (seq & 1); cyc = (rd->read_sched_clock() - rd->epoch_cyc) & rd->sched_clock_mask; @@ -108,27 +109,51 @@ unsigned long long notrace sched_clock(void) return res; } +/* + * Updating the data required to read the clock. + * + * sched_clock will never observe mis-matched data even if called from + * an NMI. We do this by maintaining an odd/even copy of the data and + * steering sched_clock to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock as much + * as possible the system reverts back to the even copy when the update + * completes; the odd copy is used *only* during an update. + */ +static void update_clock_read_data(struct clock_read_data *rd) +{ + /* update the backup (odd) copy with the new data */ + cd.read_data[1] = *rd; + + /* steer readers towards the odd copy */ + raw_write_seqcount_latch(&cd.seq); + + /* now its safe for us to update the normal (even) copy */ + cd.read_data[0] = *rd; + + /* switch readers back to the even copy */ + raw_write_seqcount_latch(&cd.seq); +} + /* * Atomically update the sched_clock epoch. 
*/ static void update_sched_clock(void) { - unsigned long flags; u64 cyc; u64 ns; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; + + rd = cd.read_data[0]; cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); - - raw_local_irq_save(flags); - raw_write_seqcount_begin(&cd.seq); - rd->epoch_ns = ns; - rd->epoch_cyc = cyc; - raw_write_seqcount_end(&cd.seq); - raw_local_irq_restore(flags); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); + + rd.epoch_ns = ns; + rd.epoch_cyc = cyc; + + update_clock_read_data(&rd); } static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) @@ -145,7 +170,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, u32 new_mult, new_shift; unsigned long r; char r_unit; - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data rd; if (cd.rate > rate) return; @@ -162,22 +187,23 @@ void __init sched_clock_register(u64 (*read)(void), int bits, wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); cd.wrap_kt = ns_to_ktime(wrap); + rd = cd.read_data[0]; + /* update epoch for new counter and update epoch_ns from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd->epoch_ns + - cyc_to_ns((cyc - rd->epoch_cyc) & rd->sched_clock_mask, - rd->mult, rd->shift); + ns = rd.epoch_ns + + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, + rd.mult, rd.shift); cd.actual_read_sched_clock = read; - raw_write_seqcount_begin(&cd.seq); - rd->read_sched_clock = read; - rd->sched_clock_mask = new_mask; - rd->mult = new_mult; - rd->shift = new_shift; - rd->epoch_cyc = new_epoch; - rd->epoch_ns = ns; - raw_write_seqcount_end(&cd.seq); + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd); r = rate; if (r >= 4000000) { @@ -227,15 +253,22 @@ void __init sched_clock_postinit(void) * * This function makes it appear to sched_clock() as if the clock * stopped counting at its last update. + * + * This function must only be called from the critical + * section in sched_clock(). It relies on the read_seqcount_retry() + * at the end of the critical section to be sure we observe the + * correct copy of epoch_cyc. */ static u64 notrace suspended_sched_clock_read(void) { - return cd.read_data.epoch_cyc; + unsigned long seq = raw_read_seqcount(&cd.seq); + + return cd.read_data[seq & 1].epoch_cyc; } static int sched_clock_suspend(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0]; update_sched_clock(); hrtimer_cancel(&sched_clock_timer); @@ -245,7 +278,7 @@ static int sched_clock_suspend(void) static void sched_clock_resume(void) { - struct clock_read_data *rd = &cd.read_data; + struct clock_read_data *rd = &cd.read_data[0]; rd->epoch_cyc = cd.actual_read_sched_clock(); hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); -- cgit v1.3-8-gc7d7 From 32fea568aec5b73ae27253125522b5c2a970a1f0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Mar 2015 07:08:06 +0100 Subject: timers, sched/clock: Clean up the code a bit Trivial cleanups, to improve the readability of the generic sched_clock() code: - Improve and standardize comments - Standardize the coding style - Use vertical spacing where appropriate - etc. 
No code changed: md5: 19a053b31e0c54feaeff1492012b019a sched_clock.o.before.asm 19a053b31e0c54feaeff1492012b019a sched_clock.o.after.asm Cc: Catalin Marinas Cc: Daniel Thompson Cc: John Stultz Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Russell King Cc: Stephen Boyd Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Ingo Molnar --- kernel/time/sched_clock.c | 107 ++++++++++++++++++++++++---------------------- 1 file changed, 56 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index eeea1e950b72..a26036d37a38 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -1,5 +1,6 @@ /* - * sched_clock.c: support for extending counters to full 64-bit ns counter + * sched_clock.c: Generic sched_clock() support, to extend low level + * hardware time counters to full 64-bit ns values. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -19,15 +20,15 @@ #include /** - * struct clock_read_data - data required to read from sched_clock + * struct clock_read_data - data required to read from sched_clock() * - * @epoch_ns: sched_clock value at last update - * @epoch_cyc: Clock cycle value at last update + * @epoch_ns: sched_clock() value at last update + * @epoch_cyc: Clock cycle value at last update. * @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit - * clocks - * @read_sched_clock: Current clock source (or dummy source when suspended) - * @mult: Multipler for scaled math conversion - * @shift: Shift value for scaled math conversion + * clocks. + * @read_sched_clock: Current clock source (or dummy source when suspended). + * @mult: Multipler for scaled math conversion. + * @shift: Shift value for scaled math conversion. * * Care must be taken when updating this structure; it is read by * some very hot code paths. It occupies <=40 bytes and, when combined @@ -44,25 +45,26 @@ struct clock_read_data { }; /** - * struct clock_data - all data needed for sched_clock (including + * struct clock_data - all data needed for sched_clock() (including * registration of a new clock source) * * @seq: Sequence counter for protecting updates. The lowest * bit is the index for @read_data. * @read_data: Data required to read from sched_clock. - * @wrap_kt: Duration for which clock can run before wrapping - * @rate: Tick rate of the registered clock - * @actual_read_sched_clock: Registered clock read function + * @wrap_kt: Duration for which clock can run before wrapping. + * @rate: Tick rate of the registered clock. + * @actual_read_sched_clock: Registered hardware level clock read function. * * The ordering of this structure has been chosen to optimize cache - * performance. In particular seq and read_data[0] (combined) should fit - * into a single 64 byte cache line. + * performance. In particular 'seq' and 'read_data[0]' (combined) should fit + * into a single 64-byte cache line. */ struct clock_data { - seqcount_t seq; - struct clock_read_data read_data[2]; - ktime_t wrap_kt; - unsigned long rate; + seqcount_t seq; + struct clock_read_data read_data[2]; + ktime_t wrap_kt; + unsigned long rate; + u64 (*actual_read_sched_clock)(void); }; @@ -112,10 +114,10 @@ unsigned long long notrace sched_clock(void) /* * Updating the data required to read the clock. * - * sched_clock will never observe mis-matched data even if called from + * sched_clock() will never observe mis-matched data even if called from * an NMI. 
We do this by maintaining an odd/even copy of the data and - * steering sched_clock to one or the other using a sequence counter. - * In order to preserve the data cache profile of sched_clock as much + * steering sched_clock() to one or the other using a sequence counter. + * In order to preserve the data cache profile of sched_clock() as much * as possible the system reverts back to the even copy when the update * completes; the odd copy is used *only* during an update. */ @@ -135,7 +137,7 @@ static void update_clock_read_data(struct clock_read_data *rd) } /* - * Atomically update the sched_clock epoch. + * Atomically update the sched_clock() epoch. */ static void update_sched_clock(void) { @@ -146,9 +148,7 @@ static void update_sched_clock(void) rd = cd.read_data[0]; cyc = cd.actual_read_sched_clock(); - ns = rd.epoch_ns + - cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, - rd.mult, rd.shift); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); rd.epoch_ns = ns; rd.epoch_cyc = cyc; @@ -160,11 +160,12 @@ static enum hrtimer_restart sched_clock_poll(struct hrtimer *hrt) { update_sched_clock(); hrtimer_forward_now(hrt, cd.wrap_kt); + return HRTIMER_RESTART; } -void __init sched_clock_register(u64 (*read)(void), int bits, - unsigned long rate) +void __init +sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) { u64 res, wrap, new_mask, new_epoch, cyc, ns; u32 new_mult, new_shift; @@ -177,51 +178,53 @@ void __init sched_clock_register(u64 (*read)(void), int bits, WARN_ON(!irqs_disabled()); - /* calculate the mult/shift to convert counter ticks to ns. */ + /* Calculate the mult/shift to convert counter ticks to ns. */ clocks_calc_mult_shift(&new_mult, &new_shift, rate, NSEC_PER_SEC, 3600); new_mask = CLOCKSOURCE_MASK(bits); cd.rate = rate; - /* calculate how many nanosecs until we risk wrapping */ + /* Calculate how many nanosecs until we risk wrapping */ wrap = clocks_calc_max_nsecs(new_mult, new_shift, 0, new_mask, NULL); cd.wrap_kt = ns_to_ktime(wrap); rd = cd.read_data[0]; - /* update epoch for new counter and update epoch_ns from old counter*/ + /* Update epoch for new counter and update 'epoch_ns' from old counter*/ new_epoch = read(); cyc = cd.actual_read_sched_clock(); - ns = rd.epoch_ns + - cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, - rd.mult, rd.shift); + ns = rd.epoch_ns + cyc_to_ns((cyc - rd.epoch_cyc) & rd.sched_clock_mask, rd.mult, rd.shift); cd.actual_read_sched_clock = read; - rd.read_sched_clock = read; - rd.sched_clock_mask = new_mask; - rd.mult = new_mult; - rd.shift = new_shift; - rd.epoch_cyc = new_epoch; - rd.epoch_ns = ns; + rd.read_sched_clock = read; + rd.sched_clock_mask = new_mask; + rd.mult = new_mult; + rd.shift = new_shift; + rd.epoch_cyc = new_epoch; + rd.epoch_ns = ns; + update_clock_read_data(&rd); r = rate; if (r >= 4000000) { r /= 1000000; r_unit = 'M'; - } else if (r >= 1000) { - r /= 1000; - r_unit = 'k'; - } else - r_unit = ' '; - - /* calculate the ns resolution of this counter */ + } else { + if (r >= 1000) { + r /= 1000; + r_unit = 'k'; + } else { + r_unit = ' '; + } + } + + /* Calculate the ns resolution of this counter */ res = cyc_to_ns(1ULL, new_mult, new_shift); pr_info("sched_clock: %u bits at %lu%cHz, resolution %lluns, wraps every %lluns\n", bits, r, r_unit, res, wrap); - /* Enable IRQ time accounting if we have a fast enough sched_clock */ + /* Enable IRQ time accounting if we have a fast enough sched_clock() */ if (irqtime > 0 || (irqtime == -1 && rate >= 1000000)) 
enable_sched_clock_irqtime(); @@ -231,7 +234,7 @@ void __init sched_clock_register(u64 (*read)(void), int bits, void __init sched_clock_postinit(void) { /* - * If no sched_clock function has been provided at that point, + * If no sched_clock() function has been provided at that point, * make it the final one one. */ if (cd.actual_read_sched_clock == jiffy_sched_clock_read) @@ -257,7 +260,7 @@ void __init sched_clock_postinit(void) * This function must only be called from the critical * section in sched_clock(). It relies on the read_seqcount_retry() * at the end of the critical section to be sure we observe the - * correct copy of epoch_cyc. + * correct copy of 'epoch_cyc'. */ static u64 notrace suspended_sched_clock_read(void) { @@ -273,6 +276,7 @@ static int sched_clock_suspend(void) update_sched_clock(); hrtimer_cancel(&sched_clock_timer); rd->read_sched_clock = suspended_sched_clock_read; + return 0; } @@ -286,13 +290,14 @@ static void sched_clock_resume(void) } static struct syscore_ops sched_clock_ops = { - .suspend = sched_clock_suspend, - .resume = sched_clock_resume, + .suspend = sched_clock_suspend, + .resume = sched_clock_resume, }; static int __init sched_clock_syscore_init(void) { register_syscore_ops(&sched_clock_ops); + return 0; } device_initcall(sched_clock_syscore_init); -- cgit v1.3-8-gc7d7 From 36ee28e45df50c2c8624b978335516e42d84ae1f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:04 +0100 Subject: sched: Add sched_avg::utilization_avg_contrib Add new statistics which reflect the average time a task is running on the CPU and the sum of these running time of the tasks on a runqueue. The latter is named utilization_load_avg. This patch is based on the usage metric that was proposed in the 1st versions of the per-entity load tracking patchset by Paul Turner but that has be removed afterwards. This version differs from the original one in the sense that it's not linked to task_group. The rq's utilization_load_avg will be used to check if a rq is overloaded or not instead of trying to compute how many tasks a group of CPUs can handle. Rename runnable_avg_period into avg_period as it is now used with both runnable_avg_sum and running_avg_sum. Add some descriptions of the variables to explain their differences. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-2-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 21 ++++++++++++--- kernel/sched/debug.c | 10 ++++--- kernel/sched/fair.c | 74 ++++++++++++++++++++++++++++++++++++++++----------- kernel/sched/sched.h | 8 +++++- 4 files changed, 89 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d77432e14ff..fdca05c5f812 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1115,15 +1115,28 @@ struct load_weight { }; struct sched_avg { + u64 last_runnable_update; + s64 decay_count; + /* + * utilization_avg_contrib describes the amount of time that a + * sched_entity is running on a CPU. It is based on running_avg_sum + * and is scaled in the range [0..SCHED_LOAD_SCALE]. 
+ * load_avg_contrib described the amount of time that a sched_entity + * is runnable on a rq. It is based on both runnable_avg_sum and the + * weight of the task. + */ + unsigned long load_avg_contrib, utilization_avg_contrib; /* * These sums represent an infinite geometric series and so are bound * above by 1024/(1-y). Thus we only need a u32 to store them for all * choices of y < 1-2^(-32)*1024. + * running_avg_sum reflects the time that the sched_entity is + * effectively running on the CPU. + * runnable_avg_sum represents the amount of time a sched_entity is on + * a runqueue which includes the running time that is monitored by + * running_avg_sum. */ - u32 runnable_avg_sum, runnable_avg_period; - u64 last_runnable_update; - s64 decay_count; - unsigned long load_avg_contrib; + u32 runnable_avg_sum, avg_period, running_avg_sum; }; #ifdef CONFIG_SCHEDSTATS diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 8baaf858d25c..578ff83d1d1a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -71,7 +71,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group if (!se) { struct sched_avg *avg = &cpu_rq(cpu)->avg; P(avg->runnable_avg_sum); - P(avg->runnable_avg_period); + P(avg->avg_period); return; } @@ -94,7 +94,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); - P(se->avg.runnable_avg_period); + P(se->avg.avg_period); P(se->avg.load_avg_contrib); P(se->avg.decay_count); #endif @@ -214,6 +214,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) cfs_rq->runnable_load_avg); SEQ_printf(m, " .%-30s: %ld\n", "blocked_load_avg", cfs_rq->blocked_load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "utilization_load_avg", + cfs_rq->utilization_load_avg); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %ld\n", "tg_load_contrib", cfs_rq->tg_load_contrib); @@ -636,8 +638,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) P(se.load.weight); #ifdef CONFIG_SMP P(se.avg.runnable_avg_sum); - P(se.avg.runnable_avg_period); + P(se.avg.running_avg_sum); + P(se.avg.avg_period); P(se.avg.load_avg_contrib); + P(se.avg.utilization_avg_contrib); P(se.avg.decay_count); #endif P(policy); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ee595ef30470..414408dd6e0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -670,6 +670,7 @@ static int select_idle_sibling(struct task_struct *p, int cpu); static unsigned long task_h_load(struct task_struct *p); static inline void __update_task_entity_contrib(struct sched_entity *se); +static inline void __update_task_entity_utilization(struct sched_entity *se); /* Give new task start runnable values to heavy its load in infant time */ void init_task_runnable_average(struct task_struct *p) @@ -677,9 +678,10 @@ void init_task_runnable_average(struct task_struct *p) u32 slice; slice = sched_slice(task_cfs_rq(p), &p->se) >> 10; - p->se.avg.runnable_avg_sum = slice; - p->se.avg.runnable_avg_period = slice; + p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice; + p->se.avg.avg_period = slice; __update_task_entity_contrib(&p->se); + __update_task_entity_utilization(&p->se); } #else void init_task_runnable_average(struct task_struct *p) @@ -1684,7 +1686,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) *period = now - p->last_task_numa_placement; } else { delta = p->se.avg.runnable_avg_sum; - *period = p->se.avg.runnable_avg_period; + 
*period = p->se.avg.avg_period; } p->last_sum_exec_runtime = runtime; @@ -2512,7 +2514,8 @@ static u32 __compute_runnable_contrib(u64 n) */ static __always_inline int __update_entity_runnable_avg(u64 now, struct sched_avg *sa, - int runnable) + int runnable, + int running) { u64 delta, periods; u32 runnable_contrib; @@ -2538,7 +2541,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->last_runnable_update = now; /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->runnable_avg_period % 1024; + delta_w = sa->avg_period % 1024; if (delta + delta_w >= 1024) { /* period roll-over */ decayed = 1; @@ -2551,7 +2554,9 @@ static __always_inline int __update_entity_runnable_avg(u64 now, delta_w = 1024 - delta_w; if (runnable) sa->runnable_avg_sum += delta_w; - sa->runnable_avg_period += delta_w; + if (running) + sa->running_avg_sum += delta_w; + sa->avg_period += delta_w; delta -= delta_w; @@ -2561,20 +2566,26 @@ static __always_inline int __update_entity_runnable_avg(u64 now, sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum, periods + 1); - sa->runnable_avg_period = decay_load(sa->runnable_avg_period, + sa->running_avg_sum = decay_load(sa->running_avg_sum, + periods + 1); + sa->avg_period = decay_load(sa->avg_period, periods + 1); /* Efficiently calculate \sum (1..n_period) 1024*y^i */ runnable_contrib = __compute_runnable_contrib(periods); if (runnable) sa->runnable_avg_sum += runnable_contrib; - sa->runnable_avg_period += runnable_contrib; + if (running) + sa->running_avg_sum += runnable_contrib; + sa->avg_period += runnable_contrib; } /* Remainder of delta accrued against u_0` */ if (runnable) sa->runnable_avg_sum += delta; - sa->runnable_avg_period += delta; + if (running) + sa->running_avg_sum += delta; + sa->avg_period += delta; return decayed; } @@ -2591,6 +2602,8 @@ static inline u64 __synchronize_entity_decay(struct sched_entity *se) return 0; se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays); + se->avg.utilization_avg_contrib = + decay_load(se->avg.utilization_avg_contrib, decays); return decays; } @@ -2626,7 +2639,7 @@ static inline void __update_tg_runnable_avg(struct sched_avg *sa, /* The fraction of a cpu used by this cfs_rq */ contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT, - sa->runnable_avg_period + 1); + sa->avg_period + 1); contrib -= cfs_rq->tg_runnable_contrib; if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) { @@ -2679,7 +2692,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable); + __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable, + runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2697,7 +2711,7 @@ static inline void __update_task_entity_contrib(struct sched_entity *se) /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight); - contrib /= (se->avg.runnable_avg_period + 1); + contrib /= (se->avg.avg_period + 1); se->avg.load_avg_contrib = scale_load(contrib); } @@ -2716,6 +2730,27 @@ static long __update_entity_load_avg_contrib(struct sched_entity *se) return se->avg.load_avg_contrib - old_contrib; } + +static inline void __update_task_entity_utilization(struct sched_entity *se) +{ + u32 contrib; + + /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */ + contrib = 
se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE); + contrib /= (se->avg.avg_period + 1); + se->avg.utilization_avg_contrib = scale_load(contrib); +} + +static long __update_entity_utilization_avg_contrib(struct sched_entity *se) +{ + long old_contrib = se->avg.utilization_avg_contrib; + + if (entity_is_task(se)) + __update_task_entity_utilization(se); + + return se->avg.utilization_avg_contrib - old_contrib; +} + static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq, long load_contrib) { @@ -2732,7 +2767,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) { struct cfs_rq *cfs_rq = cfs_rq_of(se); - long contrib_delta; + long contrib_delta, utilization_delta; u64 now; /* @@ -2744,18 +2779,22 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq)) + if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + cfs_rq->curr == se)) return; contrib_delta = __update_entity_load_avg_contrib(se); + utilization_delta = __update_entity_utilization_avg_contrib(se); if (!update_cfs_rq) return; - if (se->on_rq) + if (se->on_rq) { cfs_rq->runnable_load_avg += contrib_delta; - else + cfs_rq->utilization_load_avg += utilization_delta; + } else { subtract_blocked_load_contrib(cfs_rq, -contrib_delta); + } } /* @@ -2830,6 +2869,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq, } cfs_rq->runnable_load_avg += se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib; /* we force update consideration on load-balancer moves */ update_cfs_rq_blocked_load(cfs_rq, !wakeup); } @@ -2848,6 +2888,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, update_cfs_rq_blocked_load(cfs_rq, !sleep); cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib; + cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib; if (sleep) { cfs_rq->blocked_load_avg += se->avg.load_avg_contrib; se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); @@ -3185,6 +3226,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) */ update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); + update_entity_load_avg(se, 1); } update_stats_curr_start(cfs_rq, se); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c2c0d7bd5027..4c95cc2e0be2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -363,8 +363,14 @@ struct cfs_rq { * Under CFS, load is tracked on a per-entity basis and aggregated up. * This allows for the description of both thread and group usage (in * the FAIR_GROUP_SCHED case). + * runnable_load_avg is the sum of the load_avg_contrib of the + * sched_entities on the rq. + * blocked_load_avg is similar to runnable_load_avg except that its + * the blocked sched_entities on the rq. + * utilization_load_avg is the sum of the average running time of the + * sched_entities on the rq. */ - unsigned long runnable_load_avg, blocked_load_avg; + unsigned long runnable_load_avg, blocked_load_avg, utilization_load_avg; atomic64_t decay_counter; u64 last_decay; atomic_long_t removed_load; -- cgit v1.3-8-gc7d7 From 21f4486630b0bd1b6dbcc04f61836987fa54278f Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Fri, 27 Feb 2015 16:54:05 +0100 Subject: sched: Track group sched_entity usage contributions Add usage contribution tracking for group entities. 
Unlike se->avg.load_avg_contrib, se->avg.utilization_avg_contrib for group entities is the sum of se->avg.utilization_avg_contrib for all entities on the group runqueue. It is _not_ influenced in any way by the task group h_load. Hence it is representing the actual cpu usage of the group, not its intended load contribution which may differ significantly from the utilization on lightly utilized systems. Signed-off-by: Morten Rasmussen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-3-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/debug.c | 2 ++ kernel/sched/fair.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 578ff83d1d1a..a245c1fc6f0a 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -94,8 +94,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P(se->load.weight); #ifdef CONFIG_SMP P(se->avg.runnable_avg_sum); + P(se->avg.running_avg_sum); P(se->avg.avg_period); P(se->avg.load_avg_contrib); + P(se->avg.utilization_avg_contrib); P(se->avg.decay_count); #endif #undef PN diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 414408dd6e0c..d94a86511ae9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2747,6 +2747,9 @@ static long __update_entity_utilization_avg_contrib(struct sched_entity *se) if (entity_is_task(se)) __update_task_entity_utilization(se); + else + se->avg.utilization_avg_contrib = + group_cfs_rq(se)->utilization_load_avg; return se->avg.utilization_avg_contrib - old_contrib; } -- cgit v1.3-8-gc7d7 From a8faa8f55d48496f64d96df48298e54fd380f6af Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:06 +0100 Subject: sched: Remove frequency scaling from cpu_capacity Now that arch_scale_cpu_capacity has been introduced to scale the original capacity, the arch_scale_freq_capacity is no longer used (it was previously used by ARM arch). Remove arch_scale_freq_capacity from the computation of cpu_capacity. The frequency invariance will be handled in the load tracking and not in the CPU capacity. arch_scale_freq_capacity will be revisited for scaling load with the current frequency of the CPUs in a later patch. 
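As a rough sketch of where this is heading (nothing below is introduced by this patch), the hook that will carry the frequency scaling factor keeps the signature already declared in the scheduler, unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu), and is expected to return a value in the [0..SCHED_CAPACITY_SCALE] range. A hypothetical architecture implementation, cheap enough for the hot path because it only returns a cached ratio that cpufreq updates elsewhere, could look roughly like the following; the per-cpu variable and the set_freq_scale() helper are made-up names used purely for illustration:

	#include <linux/percpu.h>
	#include <linux/sched.h>	/* SCHED_CAPACITY_SCALE, SCHED_CAPACITY_SHIFT */

	static DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;

	/* Hot path: return the cached current/max frequency ratio for this CPU. */
	unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
	{
		return per_cpu(freq_scale, cpu);
	}

	/* Hypothetical update, e.g. from a cpufreq transition notifier. */
	static void set_freq_scale(int cpu, unsigned long cur_freq, unsigned long max_freq)
	{
		per_cpu(freq_scale, cpu) = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
	}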
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-4-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d94a86511ae9..e54231fc6336 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6042,13 +6042,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) sdg->sgc->capacity_orig = capacity; - if (sched_feat(ARCH_CAPACITY)) - capacity *= arch_scale_freq_capacity(sd, cpu); - else - capacity *= default_scale_capacity(sd, cpu); - - capacity >>= SCHED_CAPACITY_SHIFT; - capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; -- cgit v1.3-8-gc7d7 From 0c1dc6b27dac883ee78392189c8e20e764d79bfa Mon Sep 17 00:00:00 2001 From: Morten Rasmussen Date: Wed, 4 Mar 2015 08:46:26 +0100 Subject: sched: Make sched entity usage tracking scale-invariant Apply frequency scale-invariance correction factor to usage tracking. Each segment of the running_avg_sum geometric series is now scaled by the current frequency so the utilization_avg_contrib of each entity will be invariant with frequency scaling. As a result, utilization_load_avg which is the sum of utilization_avg_contrib, becomes invariant too. So the usage level that is returned by get_cpu_usage(), stays relative to the max frequency as the cpu_capacity which is is compared against. Then, we want the keep the load tracking values in a 32-bit type, which implies that the max value of {runnable|running}_avg_sum must be lower than 2^32/88761=48388 (88761 is the max weigth of a task). As LOAD_AVG_MAX = 47742, arch_scale_freq_capacity() must return a value less than (48388/47742) << SCHED_CAPACITY_SHIFT = 1037 (SCHED_SCALE_CAPACITY = 1024). So we define the range to [0..SCHED_SCALE_CAPACITY] in order to avoid overflow. Signed-off-by: Morten Rasmussen Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Paul Turner Cc: Ben Segall Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425455186-13451-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e54231fc6336..7f031e454740 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2484,6 +2484,8 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } +unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. To do this we sub-divide our runnable @@ -2512,7 +2514,7 @@ static u32 __compute_runnable_contrib(u64 n) * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... ) * = u_0 + u_1*y + u_2*y^2 + ... 
[re-labeling u_i --> u_{i+1}] */ -static __always_inline int __update_entity_runnable_avg(u64 now, +static __always_inline int __update_entity_runnable_avg(u64 now, int cpu, struct sched_avg *sa, int runnable, int running) @@ -2520,6 +2522,7 @@ static __always_inline int __update_entity_runnable_avg(u64 now, u64 delta, periods; u32 runnable_contrib; int delta_w, decayed = 0; + unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu); delta = now - sa->last_runnable_update; /* @@ -2555,7 +2558,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += delta_w; if (running) - sa->running_avg_sum += delta_w; + sa->running_avg_sum += delta_w * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += delta_w; delta -= delta_w; @@ -2576,7 +2580,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += runnable_contrib; if (running) - sa->running_avg_sum += runnable_contrib; + sa->running_avg_sum += runnable_contrib * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += runnable_contrib; } @@ -2584,7 +2589,8 @@ static __always_inline int __update_entity_runnable_avg(u64 now, if (runnable) sa->runnable_avg_sum += delta; if (running) - sa->running_avg_sum += delta; + sa->running_avg_sum += delta * scale_freq + >> SCHED_CAPACITY_SHIFT; sa->avg_period += delta; return decayed; @@ -2692,8 +2698,8 @@ static inline void __update_group_entity_contrib(struct sched_entity *se) static inline void update_rq_runnable_avg(struct rq *rq, int runnable) { - __update_entity_runnable_avg(rq_clock_task(rq), &rq->avg, runnable, - runnable); + __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg, + runnable, runnable); __update_tg_runnable_avg(&rq->avg, &rq->cfs); } #else /* CONFIG_FAIR_GROUP_SCHED */ @@ -2771,6 +2777,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, { struct cfs_rq *cfs_rq = cfs_rq_of(se); long contrib_delta, utilization_delta; + int cpu = cpu_of(rq_of(cfs_rq)); u64 now; /* @@ -2782,7 +2789,7 @@ static inline void update_entity_load_avg(struct sched_entity *se, else now = cfs_rq_clock_task(group_cfs_rq(se)); - if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq, + if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq, cfs_rq->curr == se)) return; -- cgit v1.3-8-gc7d7 From b5b4860d1d61ddc5308c7d492cbeaa3a6e508d7f Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:08 +0100 Subject: sched: Make scale_rt invariant with frequency The average running time of RT tasks is used to estimate the remaining compute capacity for CFS tasks. This remaining capacity is the original capacity scaled down by a factor (aka scale_rt_capacity). This estimation of available capacity must also be invariant with frequency scaling. A frequency scaling factor is applied on the running time of the RT tasks for computing scale_rt_capacity. 
In sched_rt_avg_update(), we now scale the RT execution time like below: rq->rt_avg += rt_delta * arch_scale_freq_capacity() >> SCHED_CAPACITY_SHIFT Then, scale_rt_capacity can be summarized by: scale_rt_capacity = SCHED_CAPACITY_SCALE * available / total with available = total - rq->rt_avg This has been been optimized in current code by: scale_rt_capacity = available / (total >> SCHED_CAPACITY_SHIFT) But we can also developed the equation like below: scale_rt_capacity = SCHED_CAPACITY_SCALE - ((rq->rt_avg << SCHED_CAPACITY_SHIFT) / total) and we can optimize the equation by removing SCHED_CAPACITY_SHIFT shift in the computation of rq->rt_avg and scale_rt_capacity(). so rq->rt_avg += rt_delta * arch_scale_freq_capacity() and scale_rt_capacity = SCHED_CAPACITY_SCALE - (rq->rt_avg / total) arch_scale_frequency_capacity() will be called in the hot path of the scheduler which implies to have a short and efficient function. As an example, arch_scale_frequency_capacity() should return a cached value that is updated periodically outside of the hot path. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-6-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 17 +++++------------ kernel/sched/sched.h | 4 +++- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7f031e454740..dc7c693f044a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6004,7 +6004,7 @@ unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) static unsigned long scale_rt_capacity(int cpu) { struct rq *rq = cpu_rq(cpu); - u64 total, available, age_stamp, avg; + u64 total, used, age_stamp, avg; s64 delta; /* @@ -6020,19 +6020,12 @@ static unsigned long scale_rt_capacity(int cpu) total = sched_avg_period() + delta; - if (unlikely(total < avg)) { - /* Ensures that capacity won't end up being negative */ - available = 0; - } else { - available = total - avg; - } + used = div_u64(avg, total); - if (unlikely((s64)total < SCHED_CAPACITY_SCALE)) - total = SCHED_CAPACITY_SCALE; + if (likely(used < SCHED_CAPACITY_SCALE)) + return SCHED_CAPACITY_SCALE - used; - total >>= SCHED_CAPACITY_SHIFT; - - return div_u64(available, total); + return 1; } static void update_cpu_capacity(struct sched_domain *sd, int cpu) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 4c95cc2e0be2..36000029f33b 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1386,9 +1386,11 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); +extern unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { - rq->rt_avg += rt_delta; + rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq)); sched_avg_update(rq); } #else -- cgit v1.3-8-gc7d7 From ca6d75e6908efbc350d536e0b496ebdac36b20d2 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:09 +0100 Subject: sched: Add struct rq::cpu_capacity_orig This new field 'cpu_capacity_orig' reflects the original capacity of a CPU before being altered by rt tasks and/or IRQ 
The cpu_capacity_orig will be used: - to detect when the capacity of a CPU has been noticeably reduced so we can trig load balance to look for a CPU with better capacity. As an example, we can detect when a CPU handles a significant amount of irq (with CONFIG_IRQ_TIME_ACCOUNTING) but this CPU is seen as an idle CPU by scheduler whereas CPUs, which are really idle, are available. - evaluate the available capacity for CFS tasks Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kamalesh Babulal Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-7-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 8 +++++++- kernel/sched/sched.h | 1 + 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index feda520bd034..7022e9084624 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7216,7 +7216,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc7c693f044a..10f84c3c6769 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4363,6 +4363,11 @@ static unsigned long capacity_of(int cpu) return cpu_rq(cpu)->cpu_capacity; } +static unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6040,6 +6045,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity >>= SCHED_CAPACITY_SHIFT; + cpu_rq(cpu)->cpu_capacity_orig = capacity; sdg->sgc->capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); @@ -6094,7 +6100,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Runtime updates will correct capacity_orig. */ if (unlikely(!rq->sd)) { - capacity_orig += capacity_of(cpu); + capacity_orig += capacity_orig_of(cpu); capacity += capacity_of(cpu); continue; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 36000029f33b..be56dfd645b2 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -615,6 +615,7 @@ struct rq { struct sched_domain *sd; unsigned long cpu_capacity; + unsigned long cpu_capacity_orig; unsigned char idle_balance; /* For active balancing */ -- cgit v1.3-8-gc7d7 From 8bb5b00c2f90100a272b09a9d17ec7875d088aa7 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 4 Mar 2015 08:48:47 +0100 Subject: sched: Calculate CPU's usage statistic and put it into struct sg_lb_stats::group_usage Monitor the usage level of each group of each sched_domain level. The usage is the portion of cpu_capacity_orig that is currently used on a CPU or group of CPUs. We use the utilization_load_avg to evaluate the usage level of each group. The utilization_load_avg only takes into account the running time of the CFS tasks on a CPU with a maximum value of SCHED_LOAD_SCALE when the CPU is fully utilized. 
Nevertheless, we must cap utilization_load_avg which can be temporally greater than SCHED_LOAD_SCALE after the migration of a task on this CPU and until the metrics are stabilized. The utilization_load_avg is in the range [0..SCHED_LOAD_SCALE] to reflect the running load on the CPU whereas the available capacity for the CFS task is in the range [0..cpu_capacity_orig]. In order to test if a CPU is fully utilized by CFS tasks, we have to scale the utilization in the cpu_capacity_orig range of the CPU to get the usage of the latter. The usage can then be compared with the available capacity (ie cpu_capacity) to deduct the usage level of a CPU. The frequency scaling invariance of the usage is not taken into account in this patch, it will be solved in another patch which will deal with frequency scaling invariance on the utilization_load_avg. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Acked-by: Morten Rasmussen Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425455327-13508-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 10f84c3c6769..471193bdd4b7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4781,6 +4781,33 @@ next: done: return target; } +/* + * get_cpu_usage returns the amount of capacity of a CPU that is used by CFS + * tasks. The unit of the return value must be the one of capacity so we can + * compare the usage with the capacity of the CPU that is available for CFS + * task (ie cpu_capacity). + * cfs.utilization_load_avg is the sum of running time of runnable tasks on a + * CPU. It represents the amount of utilization of a CPU in the range + * [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full + * capacity of the CPU because it's about the running time on this CPU. + * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE + * because of unfortunate rounding in avg_period and running_load_avg or just + * after migrating tasks until the average stabilizes with the new running + * time. So we need to check that the usage stays into the range + * [0..cpu_capacity_orig] and cap if necessary. 
+ * Without capping the usage, a group could be seen as overloaded (CPU0 usage + * at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity + */ +static int get_cpu_usage(int cpu) +{ + unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg; + unsigned long capacity = capacity_orig_of(cpu); + + if (usage >= SCHED_LOAD_SCALE) + return capacity; + + return (usage * capacity) >> SCHED_LOAD_SHIFT; +} /* * select_task_rq_fair: Select target runqueue for the waking task in domains @@ -5907,6 +5934,7 @@ struct sg_lb_stats { unsigned long sum_weighted_load; /* Weighted load of group's tasks */ unsigned long load_per_task; unsigned long group_capacity; + unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ unsigned int group_capacity_factor; unsigned int idle_cpus; @@ -6255,6 +6283,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, load = source_load(i, load_idx); sgs->group_load += load; + sgs->group_usage += get_cpu_usage(i); sgs->sum_nr_running += rq->cfs.h_nr_running; if (rq->nr_running > 1) -- cgit v1.3-8-gc7d7 From ea67821b9a3edadf602b7772a0b2a69657ced746 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:11 +0100 Subject: sched: Replace capacity_factor by usage The scheduler tries to compute how many tasks a group of CPUs can handle by assuming that a task's load is SCHED_LOAD_SCALE and a CPU's capacity is SCHED_CAPACITY_SCALE. 'struct sg_lb_stats:group_capacity_factor' divides the capacity of the group by SCHED_LOAD_SCALE to estimate how many task can run in the group. Then, it compares this value with the sum of nr_running to decide if the group is overloaded or not. But the 'group_capacity_factor' concept is hardly working for SMT systems, it sometimes works for big cores but fails to do the right thing for little cores. Below are two examples to illustrate the problem that this patch solves: 1- If the original capacity of a CPU is less than SCHED_CAPACITY_SCALE (640 as an example), a group of 3 CPUS will have a max capacity_factor of 2 (div_round_closest(3x640/1024) = 2) which means that it will be seen as overloaded even if we have only one task per CPU. 2 - If the original capacity of a CPU is greater than SCHED_CAPACITY_SCALE (1512 as an example), a group of 4 CPUs will have a capacity_factor of 4 (at max and thanks to the fix [0] for SMT system that prevent the apparition of ghost CPUs) but if one CPU is fully used by rt tasks (and its capacity is reduced to nearly nothing), the capacity factor of the group will still be 4 (div_round_closest(3*1512/1024) = 5 which is cap to 4 with [0]). So, this patch tries to solve this issue by removing capacity_factor and replacing it with the 2 following metrics: - The available CPU's capacity for CFS tasks which is already used by load_balance(). - The usage of the CPU by the CFS tasks. For the latter, utilization_avg_contrib has been re-introduced to compute the usage of a CPU by CFS tasks. 'group_capacity_factor' and 'group_has_free_capacity' has been removed and replaced by 'group_no_capacity'. We compare the number of task with the number of CPUs and we evaluate the level of utilization of the CPUs to define if a group is overloaded or if a group has capacity to handle more tasks. For SD_PREFER_SIBLING, a group is tagged overloaded if it has more than 1 task so it will be selected in priority (among the overloaded groups). 
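As a worked illustration of the two predicates introduced below (the numbers are hypothetical and assume the usual imbalance_pct of 125): take a group with group_weight = 2, group_capacity = 2048 and sum_nr_running = 3.

    group_usage = 1700:  group_capacity * 100 = 204800 <  group_usage * 125 = 212500,
                         so group_is_overloaded() is true.
    group_usage = 1500:  group_capacity * 100 = 204800 >= group_usage * 125 = 187500,
                         so group_is_overloaded() is false while group_has_capacity()
                         is still true, which is why the two helpers are not simple
                         negations of each other.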
Since [1], SD_PREFER_SIBLING is no more concerned by the computation of 'load_above_capacity' because local is not overloaded. [1] 9a5d9ba6a363 ("sched/fair: Allow calculate_imbalance() to move idle cpus") Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1425052454-25797-9-git-send-email-vincent.guittot@linaro.org [ Tidied up the changelog. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 139 +++++++++++++++++++++++++++------------------------- 1 file changed, 72 insertions(+), 67 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 471193bdd4b7..7e13dd0fd4b5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5936,11 +5936,10 @@ struct sg_lb_stats { unsigned long group_capacity; unsigned long group_usage; /* Total usage of the group */ unsigned int sum_nr_running; /* Nr tasks running in the group */ - unsigned int group_capacity_factor; unsigned int idle_cpus; unsigned int group_weight; enum group_type group_type; - int group_has_free_capacity; + int group_no_capacity; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; unsigned int nr_preferred_running; @@ -6156,28 +6155,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) } /* - * Try and fix up capacity for tiny siblings, this is needed when - * things like SD_ASYM_PACKING need f_b_g to select another sibling - * which on its own isn't powerful enough. - * - * See update_sd_pick_busiest() and check_asym_packing(). + * Check whether the capacity of the rq has been noticeably reduced by side + * activity. The imbalance_pct is used for the threshold. + * Return true is the capacity is reduced */ static inline int -fix_small_capacity(struct sched_domain *sd, struct sched_group *group) +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { - /* - * Only siblings can have significantly less than SCHED_CAPACITY_SCALE - */ - if (!(sd->flags & SD_SHARE_CPUCAPACITY)) - return 0; - - /* - * If ~90% of the cpu_capacity is still there, we're good. - */ - if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29) - return 1; - - return 0; + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); } /* @@ -6215,37 +6201,56 @@ static inline int sg_imbalanced(struct sched_group *group) } /* - * Compute the group capacity factor. - * - * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by - * first dividing out the smt factor and computing the actual number of cores - * and limit unit capacity with that. + * group_has_capacity returns true if the group has spare capacity that could + * be used by some tasks. + * We consider that a group has spare capacity if the * number of task is + * smaller than the number of CPUs or if the usage is lower than the available + * capacity for CFS tasks. + * For the latter, we use a threshold to stabilize the state, to take into + * account the variance of the tasks' load and to return true if the available + * capacity in meaningful for the load balancer. + * As an example, an available capacity of 1% can appear but it doesn't make + * any benefit for the load balance. 
*/ -static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group) +static inline bool +group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs) { - unsigned int capacity_factor, smt, cpus; - unsigned int capacity, capacity_orig; + if (sgs->sum_nr_running < sgs->group_weight) + return true; - capacity = group->sgc->capacity; - capacity_orig = group->sgc->capacity_orig; - cpus = group->group_weight; + if ((sgs->group_capacity * 100) > + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */ - smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig); - capacity_factor = cpus / smt; /* cores */ + return false; +} + +/* + * group_is_overloaded returns true if the group has more tasks than it can + * handle. + * group_is_overloaded is not equals to !group_has_capacity because a group + * with the exact right number of tasks, has no more spare capacity but is not + * overloaded so both group_has_capacity and group_is_overloaded return + * false. + */ +static inline bool +group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs) +{ + if (sgs->sum_nr_running <= sgs->group_weight) + return false; - capacity_factor = min_t(unsigned, - capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE)); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); + if ((sgs->group_capacity * 100) < + (sgs->group_usage * env->sd->imbalance_pct)) + return true; - return capacity_factor; + return false; } -static enum group_type -group_classify(struct sched_group *group, struct sg_lb_stats *sgs) +static enum group_type group_classify(struct lb_env *env, + struct sched_group *group, + struct sg_lb_stats *sgs) { - if (sgs->sum_nr_running > sgs->group_capacity_factor) + if (sgs->group_no_capacity) return group_overloaded; if (sg_imbalanced(group)) @@ -6306,11 +6311,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; sgs->group_weight = group->group_weight; - sgs->group_capacity_factor = sg_capacity_factor(env, group); - sgs->group_type = group_classify(group, sgs); - if (sgs->group_capacity_factor > sgs->sum_nr_running) - sgs->group_has_free_capacity = 1; + sgs->group_no_capacity = group_is_overloaded(env, sgs); + sgs->group_type = group_classify(env, group, sgs); } /** @@ -6432,18 +6435,19 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /* * In case the child domain prefers tasks go to siblings - * first, lower the sg capacity factor to one so that we'll try + * first, lower the sg capacity so that we'll try * and move all the excess tasks away. We lower the capacity * of a group only if the local group has the capacity to fit - * these excess tasks, i.e. nr_running < group_capacity_factor. The - * extra check prevents the case where you always pull from the - * heaviest group when it is already under-utilized (possible - * with a large weight task outweighs the tasks on the system). + * these excess tasks. The extra check prevents the case where + * you always pull from the heaviest group when it is already + * under-utilized (possible with a large weight task outweighs + * the tasks on the system). 
*/ if (prefer_sibling && sds->local && - sds->local_stat.group_has_free_capacity) { - sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U); - sgs->group_type = group_classify(sg, sgs); + group_has_capacity(env, &sds->local_stat) && + (sgs->sum_nr_running > 1)) { + sgs->group_no_capacity = 1; + sgs->group_type = group_overloaded; } if (update_sd_pick_busiest(env, sds, sg, sgs)) { @@ -6623,11 +6627,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ if (busiest->group_type == group_overloaded && local->group_type == group_overloaded) { - load_above_capacity = - (busiest->sum_nr_running - busiest->group_capacity_factor); - - load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE); - load_above_capacity /= busiest->group_capacity; + load_above_capacity = busiest->sum_nr_running * + SCHED_LOAD_SCALE; + if (load_above_capacity > busiest->group_capacity) + load_above_capacity -= busiest->group_capacity; + else + load_above_capacity = ~0UL; } /* @@ -6690,6 +6695,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env) local = &sds.local_stat; busiest = &sds.busiest_stat; + /* ASYM feature bypasses nice load balance check */ if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) && check_asym_packing(env, &sds)) return sds.busiest; @@ -6710,8 +6716,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env) goto force_balance; /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity && - !busiest->group_has_free_capacity) + if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + busiest->group_no_capacity) goto force_balance; /* @@ -6770,7 +6776,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, int i; for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { - unsigned long capacity, capacity_factor, wl; + unsigned long capacity, wl; enum fbq_type rt; rq = cpu_rq(i); @@ -6799,9 +6805,6 @@ static struct rq *find_busiest_queue(struct lb_env *env, continue; capacity = capacity_of(i); - capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE); - if (!capacity_factor) - capacity_factor = fix_small_capacity(env->sd, group); wl = weighted_cpuload(i); @@ -6809,7 +6812,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, * When comparing with imbalance, use weighted_cpuload() * which is not scaled with the cpu capacity. */ - if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance) + + if (rq->nr_running == 1 && wl > env->imbalance && + !check_cpu_capacity(rq, env->sd)) continue; /* -- cgit v1.3-8-gc7d7 From dc7ff76eadb4b89fd39bb466b8f3773e5467c11d Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 3 Mar 2015 11:35:03 +0100 Subject: sched: Remove unused struct sched_group_capacity::capacity_orig The 'struct sched_group_capacity::capacity_orig' field is no longer used in the scheduler so we can remove it. 
Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425378903-5349-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 12 ------------ kernel/sched/fair.c | 13 +++---------- kernel/sched/sched.h | 2 +- 3 files changed, 4 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7022e9084624..838fc9d1e7ab 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5447,17 +5447,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, break; } - /* - * Even though we initialize ->capacity to something semi-sane, - * we leave capacity_orig unset. This allows us to detect if - * domain iteration is still funny without causing /0 traps. - */ - if (!group->sgc->capacity_orig) { - printk(KERN_CONT "\n"); - printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n"); - break; - } - if (!cpumask_weight(sched_group_cpus(group))) { printk(KERN_CONT "\n"); printk(KERN_ERR "ERROR: empty group\n"); @@ -5941,7 +5930,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) * die on a /0 trap. */ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span); - sg->sgc->capacity_orig = sg->sgc->capacity; /* * Make sure the first group of this domain contains the diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7e13dd0fd4b5..d36f8d221669 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6073,7 +6073,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) capacity >>= SCHED_CAPACITY_SHIFT; cpu_rq(cpu)->cpu_capacity_orig = capacity; - sdg->sgc->capacity_orig = capacity; capacity *= scale_rt_capacity(cpu); capacity >>= SCHED_CAPACITY_SHIFT; @@ -6089,7 +6088,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) { struct sched_domain *child = sd->child; struct sched_group *group, *sdg = sd->groups; - unsigned long capacity, capacity_orig; + unsigned long capacity; unsigned long interval; interval = msecs_to_jiffies(sd->balance_interval); @@ -6101,7 +6100,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu) return; } - capacity_orig = capacity = 0; + capacity = 0; if (child->flags & SD_OVERLAP) { /* @@ -6121,19 +6120,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu) * Use capacity_of(), which is set irrespective of domains * in update_cpu_capacity(). * - * This avoids capacity/capacity_orig from being 0 and + * This avoids capacity from being 0 and * causing divide-by-zero issues on boot. - * - * Runtime updates will correct capacity_orig. 
*/ if (unlikely(!rq->sd)) { - capacity_orig += capacity_orig_of(cpu); capacity += capacity_of(cpu); continue; } sgc = rq->sd->groups->sgc; - capacity_orig += sgc->capacity_orig; capacity += sgc->capacity; } } else { @@ -6144,13 +6139,11 @@ void update_group_capacity(struct sched_domain *sd, int cpu) group = child->groups; do { - capacity_orig += group->sgc->capacity_orig; capacity += group->sgc->capacity; group = group->next; } while (group != child->groups); } - sdg->sgc->capacity_orig = capacity_orig; sdg->sgc->capacity = capacity; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index be56dfd645b2..dd532c558ad4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -826,7 +826,7 @@ struct sched_group_capacity { * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity * for a single CPU. */ - unsigned int capacity, capacity_orig; + unsigned int capacity; unsigned long next_update; int imbalance; /* XXX unrelated to capacity but shared group state */ /* -- cgit v1.3-8-gc7d7 From caff37ef96eac7fe96a582d032f6958e834e9447 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:13 +0100 Subject: sched: Add SD_PREFER_SIBLING for SMT level Add the SD_PREFER_SIBLING flag for SMT level in order to ensure that the scheduler will place at least one task per core. Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Preeti U. Murthy Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-11-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 838fc9d1e7ab..043e2a13b8b9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6240,6 +6240,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu) */ if (sd->flags & SD_SHARE_CPUCAPACITY) { + sd->flags |= SD_PREFER_SIBLING; sd->imbalance_pct = 110; sd->smt_gain = 1178; /* ~15% */ -- cgit v1.3-8-gc7d7 From 1aaf90a4b88aae26a4535ba01dacab520a310d17 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 27 Feb 2015 16:54:14 +0100 Subject: sched: Move CFS tasks to CPUs with higher capacity When a CPU is used to handle a lot of IRQs or some RT tasks, the remaining capacity for CFS tasks can be significantly reduced. Once we detect such situation by comparing cpu_capacity_orig and cpu_capacity, we trig an idle load balance to check if it's worth moving its tasks on an idle CPU. It's worth trying to move the task before the CPU is fully utilized to minimize the preemption by irq or RT tasks. Once the idle load_balance has selected the busiest CPU, it will look for an active load balance for only two cases: - There is only 1 task on the busiest CPU. - We haven't been able to move a task of the busiest rq. A CPU with a reduced capacity is included in the 1st case, and it's worth to actively migrate its task if the idle CPU has got more available capacity for CFS tasks. This test has been added in need_active_balance. As a sidenote, this will not generate more spurious ilb because we already trig an ilb if there is more than 1 busy cpu. If this cpu is the only one that has a task, we will trig the ilb once for migrating the task. 
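As a side note on the test itself: check_cpu_capacity(), referenced above and reused by nohz_kick_needed() below, essentially compares the capacity left for CFS tasks against the CPU's original capacity scaled by the domain's imbalance_pct. A minimal sketch, assuming the helper in kernel/sched/fair.c keeps roughly this shape (details may differ in the actual tree):

/*
 * Sketch: true when RT tasks or IRQs have reduced the capacity
 * available to CFS on this runqueue by more than sd->imbalance_pct
 * percent of the CPU's original capacity.
 */
static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
	return ((rq->cpu_capacity * sd->imbalance_pct) <
				(rq->cpu_capacity_orig * 100));
}
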
The nohz_kick_needed function has been cleaned up a bit while adding the new test env.src_cpu and env.src_rq must be set unconditionnally because they are used in need_active_balance which is called even if busiest->nr_running equals 1 Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra (Intel) Cc: Morten.Rasmussen@arm.com Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: linaro-kernel@lists.linaro.org Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/1425052454-25797-12-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 69 ++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 22 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index d36f8d221669..0576ce0e0af2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6855,6 +6855,19 @@ static int need_active_balance(struct lb_env *env) return 1; } + /* + * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. + * It's worth migrating the task if the src_cpu's capacity is reduced + * because of other sched_class or IRQs if more capacity stays + * available on dst_cpu. + */ + if ((env->idle != CPU_NOT_IDLE) && + (env->src_rq->cfs.h_nr_running == 1)) { + if ((check_cpu_capacity(env->src_rq, sd)) && + (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100)) + return 1; + } + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } @@ -6954,6 +6967,9 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); + env.src_cpu = busiest->cpu; + env.src_rq = busiest; + ld_moved = 0; if (busiest->nr_running > 1) { /* @@ -6963,8 +6979,6 @@ redo: * correctly treated as an imbalance. */ env.flags |= LBF_ALL_PINNED; - env.src_cpu = busiest->cpu; - env.src_rq = busiest; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -7664,22 +7678,25 @@ end: /* * Current heuristic for kicking the idle load balancer in the presence - * of an idle cpu is the system. + * of an idle cpu in the system. * - This rq has more than one task. - * - At any scheduler domain level, this cpu's scheduler group has multiple - * busy cpu's exceeding the group's capacity. + * - This rq has at least one CFS task and the capacity of the CPU is + * significantly reduced because of RT tasks or IRQs. + * - At parent of LLC scheduler domain level, this cpu's scheduler group has + * multiple busy cpu. * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler * domain span are idle. */ -static inline int nohz_kick_needed(struct rq *rq) +static inline bool nohz_kick_needed(struct rq *rq) { unsigned long now = jiffies; struct sched_domain *sd; struct sched_group_capacity *sgc; int nr_busy, cpu = rq->cpu; + bool kick = false; if (unlikely(rq->idle_balance)) - return 0; + return false; /* * We may be recently in ticked or tickless idle mode. At the first @@ -7693,38 +7710,46 @@ static inline int nohz_kick_needed(struct rq *rq) * balancing. 
*/ if (likely(!atomic_read(&nohz.nr_cpus))) - return 0; + return false; if (time_before(now, nohz.next_balance)) - return 0; + return false; if (rq->nr_running >= 2) - goto need_kick; + return true; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); - if (nr_busy > 1) - goto need_kick_unlock; + if (nr_busy > 1) { + kick = true; + goto unlock; + } + } - sd = rcu_dereference(per_cpu(sd_asym, cpu)); + sd = rcu_dereference(rq->sd); + if (sd) { + if ((rq->cfs.h_nr_running >= 1) && + check_cpu_capacity(rq, sd)) { + kick = true; + goto unlock; + } + } + sd = rcu_dereference(per_cpu(sd_asym, cpu)); if (sd && (cpumask_first_and(nohz.idle_cpus_mask, - sched_domain_span(sd)) < cpu)) - goto need_kick_unlock; - - rcu_read_unlock(); - return 0; + sched_domain_span(sd)) < cpu)) { + kick = true; + goto unlock; + } -need_kick_unlock: +unlock: rcu_read_unlock(); -need_kick: - return 1; + return kick; } #else static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } -- cgit v1.3-8-gc7d7 From dfbca41f347997e57048a53755611c8e2d792924 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2015 14:19:05 +0100 Subject: sched: Optimize freq invariant accounting Currently the freq invariant accounting (in __update_entity_runnable_avg() and sched_rt_avg_update()) get the scale factor from a weak function call, this means that even for archs that default on their implementation the compiler cannot see into this function and optimize the extra scaling math away. This is sad, esp. since its a 64-bit multiplication which can be quite costly on some platforms. So replace the weak function with #ifdef and __always_inline goo. This is not quite as nice from an arch support PoV but should at least result in compile time errors if done wrong. Signed-off-by: Peter Zijlstra (Intel) Cc: Ben Segall Cc: Morten.Rasmussen@arm.com Cc: Paul Turner Cc: Vincent Guittot Cc: dietmar.eggemann@arm.com Cc: efault@gmx.de Cc: kamalesh@linux.vnet.ibm.com Cc: nicolas.pitre@linaro.org Cc: preeti@linux.vnet.ibm.com Cc: riel@redhat.com Link: http://lkml.kernel.org/r/20150323131905.GF23123@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 12 ------------ kernel/sched/sched.h | 9 ++++++++- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0576ce0e0af2..3a798ec36824 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2484,8 +2484,6 @@ static u32 __compute_runnable_contrib(u64 n) return contrib + runnable_avg_yN_sum[n]; } -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu); - /* * We can represent the historical contribution to runnable average as the * coefficients of a geometric series. 
To do this we sub-divide our runnable @@ -6010,16 +6008,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd, return load_idx; } -static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu) -{ - return SCHED_CAPACITY_SCALE; -} - -unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu) -{ - return default_scale_capacity(sd, cpu); -} - static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu) { if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index dd532c558ad4..91c6736d2522 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1387,7 +1387,14 @@ static inline int hrtick_enabled(struct rq *rq) #ifdef CONFIG_SMP extern void sched_avg_update(struct rq *rq); -extern unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu); + +#ifndef arch_scale_freq_capacity +static __always_inline +unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu) +{ + return SCHED_CAPACITY_SCALE; +} +#endif static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { -- cgit v1.3-8-gc7d7 From d4573c3e1c992668f5dcd57d1c2ced56ae9650b9 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Thu, 26 Mar 2015 18:32:44 +0530 Subject: sched: Improve load balancing in the presence of idle CPUs When a CPU is kicked to do nohz idle balancing, it wakes up to do load balancing on itself, followed by load balancing on behalf of idle CPUs. But it may end up with load after the load balancing attempt on itself. This aborts nohz idle balancing. As a result several idle CPUs are left without tasks till such a time that an ILB CPU finds it unfavorable to pull tasks upon itself. This delays spreading of load across idle CPUs and worse, clutters only a few CPUs with tasks. The effect of the above problem was observed on an SMT8 POWER server with 2 levels of numa domains. Busy loops equal to number of cores were spawned. Since load balancing on fork/exec is discouraged across numa domains, all busy loops would start on one of the numa domains. However it was expected that eventually one busy loop would run per core across all domains due to nohz idle load balancing. But it was observed that it took as long as 10 seconds to spread the load across numa domains. Further investigation showed that this was a consequence of the following: 1. An ILB CPU was chosen from the first numa domain to trigger nohz idle load balancing [Given the experiment, upto 6 CPUs per core could be potentially idle in this domain.] 2. However the ILB CPU would call load_balance() on itself before initiating nohz idle load balancing. 3. Given cores are SMT8, the ILB CPU had enough opportunities to pull tasks from its sibling cores to even out load. 4. Now that the ILB CPU was no longer idle, it would abort nohz idle load balancing As a result the opportunities to spread load across numa domains were lost until such a time that the cores within the first numa domain had equal number of tasks among themselves. This is a pretty bad scenario, since the cores within the first numa domain would have as many as 4 tasks each, while cores in the neighbouring numa domains would all remain idle. Fix this, by checking if a CPU was woken up to do nohz idle load balancing, before it does load balancing upon itself. 
This way we allow idle CPUs across the system to do load balancing which results in quicker spread of load, instead of performing load balancing within the local sched domain hierarchy of the ILB CPU alone under circumstances such as above. Signed-off-by: Preeti U Murthy Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jason Low Cc: benh@kernel.crashing.org Cc: daniel.lezcano@linaro.org Cc: efault@gmx.de Cc: iamjoonsoo.kim@lge.com Cc: morten.rasmussen@arm.com Cc: pjt@google.com Cc: riel@redhat.com Cc: srikar@linux.vnet.ibm.com Cc: svaidy@linux.vnet.ibm.com Cc: tim.c.chen@linux.intel.com Cc: vincent.guittot@linaro.org Link: http://lkml.kernel.org/r/20150326130014.21532.17158.stgit@preeti.in.ibm.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3a798ec36824..46855d06666a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7753,14 +7753,16 @@ static void run_rebalance_domains(struct softirq_action *h) enum cpu_idle_type idle = this_rq->idle_balance ? CPU_IDLE : CPU_NOT_IDLE; - rebalance_domains(this_rq, idle); - /* * If this cpu has a pending nohz_balance_kick, then do the * balancing on behalf of the other idle cpus whose ticks are - * stopped. + * stopped. Do nohz_idle_balance *before* rebalance_domains to + * give the idle cpus a chance to load balance. Else we may + * load balance only within the local sched_domain hierarchy + * and abort nohz_idle_balance altogether if we pull some load. */ nohz_idle_balance(this_rq, idle); + rebalance_domains(this_rq, idle); } /* -- cgit v1.3-8-gc7d7 From bd4bde14b93cce8fa77765ff709e0be55abdba2c Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 17 Mar 2015 19:15:30 +0800 Subject: sched/deadline: Avoid a superfluous check Since commit 40767b0dc768 ("sched/deadline: Fix deadline parameter modification handling") we clear the thottled state when switching from a dl task, therefore we should never find it set in switching to a dl task. Signed-off-by: Wanpeng Li [ Improved the changelog. ] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: http://lkml.kernel.org/r/1426590931-4639-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 0a81a954c041..24c18dc10fd7 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1665,14 +1665,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) { int check_resched = 1; - /* - * If p is throttled, don't consider the possibility - * of preempting rq->curr, the check will be done right - * after its runtime will get replenished. - */ - if (unlikely(p->dl.dl_throttled)) - return; - if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded && -- cgit v1.3-8-gc7d7 From a1963b81deec54c113e770b0020e5f1c3188a087 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 17 Mar 2015 19:15:31 +0800 Subject: sched/deadline: Fix rt runtime corruption when dl fails its global constraints One version of sched_rt_global_constaints() (the !rt-cgroup one) changes state, therefore if we fail the later sched_dl_global_constraints() call the state is left in an inconsistent state. Fix this by changing the order of the calls. Signed-off-by: Wanpeng Li [ Improved the changelog. 
] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Juri Lelli Link: http://lkml.kernel.org/r/1426590931-4639-2-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 043e2a13b8b9..4b3b6887c6b1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7804,7 +7804,7 @@ static int sched_rt_global_constraints(void) } #endif /* CONFIG_RT_GROUP_SCHED */ -static int sched_dl_global_constraints(void) +static int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); @@ -7905,11 +7905,11 @@ int sched_rt_handler(struct ctl_table *table, int write, if (ret) goto undo; - ret = sched_rt_global_constraints(); + ret = sched_dl_global_validate(); if (ret) goto undo; - ret = sched_dl_global_constraints(); + ret = sched_rt_global_constraints(); if (ret) goto undo; -- cgit v1.3-8-gc7d7 From 876e78818def2983be55878b21f7152fbaebbd36 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2015 10:09:06 +0100 Subject: time: Rename timekeeper::tkr to timekeeper::tkr_mono In preparation of adding another tkr field, rename this one to tkr_mono. Also rename tk_read_base::base_mono to tk_read_base::base, since the structure is not specific to CLOCK_MONOTONIC and the mono name got added to the tk_read_base instance. Lots of trivial churn. Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150319093400.344679419@infradead.org Signed-off-by: Ingo Molnar --- arch/arm64/kernel/vdso.c | 10 +-- arch/s390/kernel/time.c | 18 ++--- arch/tile/kernel/time.c | 24 +++--- arch/x86/kernel/vsyscall_gtod.c | 24 +++--- arch/x86/kvm/x86.c | 14 ++-- include/linux/timekeeper_internal.h | 12 +-- kernel/time/timekeeping.c | 150 ++++++++++++++++++------------------ 7 files changed, 126 insertions(+), 126 deletions(-) (limited to 'kernel') diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index 32aeea083d93..ec37ab3f524f 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -200,7 +200,7 @@ up_fail: void update_vsyscall(struct timekeeper *tk) { struct timespec xtime_coarse; - u32 use_syscall = strcmp(tk->tkr.clock->name, "arch_sys_counter"); + u32 use_syscall = strcmp(tk->tkr_mono.clock->name, "arch_sys_counter"); ++vdso_data->tb_seq_count; smp_wmb(); @@ -213,11 +213,11 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtm_clock_nsec = tk->wall_to_monotonic.tv_nsec; if (!use_syscall) { - vdso_data->cs_cycle_last = tk->tkr.cycle_last; + vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec; - vdso_data->cs_mult = tk->tkr.mult; - vdso_data->cs_shift = tk->tkr.shift; + vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; + vdso_data->cs_mult = tk->tkr_mono.mult; + vdso_data->cs_shift = tk->tkr_mono.shift; } smp_wmb(); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 6c273cd815bb..170ddd2018b3 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -215,20 +215,20 @@ void update_vsyscall(struct timekeeper *tk) { u64 nsecps; - if (tk->tkr.clock != &clocksource_tod) + if (tk->tkr_mono.clock != &clocksource_tod) return; /* Make userspace gettimeofday spin until we're done. 
*/ ++vdso_data->tb_update_count; smp_wmb(); - vdso_data->xtime_tod_stamp = tk->tkr.cycle_last; + vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last; vdso_data->xtime_clock_sec = tk->xtime_sec; - vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec; + vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; vdso_data->wtom_clock_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->wtom_clock_nsec = tk->tkr.xtime_nsec + - + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr.shift); - nsecps = (u64) NSEC_PER_SEC << tk->tkr.shift; + vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec + + + ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift); + nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift; while (vdso_data->wtom_clock_nsec >= nsecps) { vdso_data->wtom_clock_nsec -= nsecps; vdso_data->wtom_clock_sec++; @@ -236,7 +236,7 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->xtime_coarse_sec = tk->xtime_sec; vdso_data->xtime_coarse_nsec = - (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); + (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); vdso_data->wtom_coarse_sec = vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec; vdso_data->wtom_coarse_nsec = @@ -246,8 +246,8 @@ void update_vsyscall(struct timekeeper *tk) vdso_data->wtom_coarse_sec++; } - vdso_data->tk_mult = tk->tkr.mult; - vdso_data->tk_shift = tk->tkr.shift; + vdso_data->tk_mult = tk->tkr_mono.mult; + vdso_data->tk_shift = tk->tkr_mono.shift; smp_wmb(); ++vdso_data->tb_update_count; } diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c index d412b0856c0a..00178ecf9aea 100644 --- a/arch/tile/kernel/time.c +++ b/arch/tile/kernel/time.c @@ -257,34 +257,34 @@ void update_vsyscall_tz(void) void update_vsyscall(struct timekeeper *tk) { - if (tk->tkr.clock != &cycle_counter_cs) + if (tk->tkr_mono.clock != &cycle_counter_cs) return; write_seqcount_begin(&vdso_data->tb_seq); - vdso_data->cycle_last = tk->tkr.cycle_last; - vdso_data->mask = tk->tkr.mask; - vdso_data->mult = tk->tkr.mult; - vdso_data->shift = tk->tkr.shift; + vdso_data->cycle_last = tk->tkr_mono.cycle_last; + vdso_data->mask = tk->tkr_mono.mask; + vdso_data->mult = tk->tkr_mono.mult; + vdso_data->shift = tk->tkr_mono.shift; vdso_data->wall_time_sec = tk->xtime_sec; - vdso_data->wall_time_snsec = tk->tkr.xtime_nsec; + vdso_data->wall_time_snsec = tk->tkr_mono.xtime_nsec; vdso_data->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdso_data->monotonic_time_snsec = tk->tkr.xtime_nsec + vdso_data->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr.shift); + << tk->tkr_mono.shift); while (vdso_data->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { vdso_data->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr.shift; + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; vdso_data->monotonic_time_sec++; } vdso_data->wall_time_coarse_sec = tk->xtime_sec; - vdso_data->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> - tk->tkr.shift); + vdso_data->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); vdso_data->monotonic_time_coarse_sec = vdso_data->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c index c7d791f32b98..51e330416995 100644 --- a/arch/x86/kernel/vsyscall_gtod.c +++ b/arch/x86/kernel/vsyscall_gtod.c @@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk) gtod_write_begin(vdata); /* copy 
vsyscall data */ - vdata->vclock_mode = tk->tkr.clock->archdata.vclock_mode; - vdata->cycle_last = tk->tkr.cycle_last; - vdata->mask = tk->tkr.mask; - vdata->mult = tk->tkr.mult; - vdata->shift = tk->tkr.shift; + vdata->vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->cycle_last = tk->tkr_mono.cycle_last; + vdata->mask = tk->tkr_mono.mask; + vdata->mult = tk->tkr_mono.mult; + vdata->shift = tk->tkr_mono.shift; vdata->wall_time_sec = tk->xtime_sec; - vdata->wall_time_snsec = tk->tkr.xtime_nsec; + vdata->wall_time_snsec = tk->tkr_mono.xtime_nsec; vdata->monotonic_time_sec = tk->xtime_sec + tk->wall_to_monotonic.tv_sec; - vdata->monotonic_time_snsec = tk->tkr.xtime_nsec + vdata->monotonic_time_snsec = tk->tkr_mono.xtime_nsec + ((u64)tk->wall_to_monotonic.tv_nsec - << tk->tkr.shift); + << tk->tkr_mono.shift); while (vdata->monotonic_time_snsec >= - (((u64)NSEC_PER_SEC) << tk->tkr.shift)) { + (((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) { vdata->monotonic_time_snsec -= - ((u64)NSEC_PER_SEC) << tk->tkr.shift; + ((u64)NSEC_PER_SEC) << tk->tkr_mono.shift; vdata->monotonic_time_sec++; } vdata->wall_time_coarse_sec = tk->xtime_sec; - vdata->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >> - tk->tkr.shift); + vdata->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >> + tk->tkr_mono.shift); vdata->monotonic_time_coarse_sec = vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd7a70be41b3..d7a300e0147f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1070,19 +1070,19 @@ static void update_pvclock_gtod(struct timekeeper *tk) struct pvclock_gtod_data *vdata = &pvclock_gtod_data; u64 boot_ns; - boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot)); + boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot)); write_seqcount_begin(&vdata->seq); /* copy pvclock gtod data */ - vdata->clock.vclock_mode = tk->tkr.clock->archdata.vclock_mode; - vdata->clock.cycle_last = tk->tkr.cycle_last; - vdata->clock.mask = tk->tkr.mask; - vdata->clock.mult = tk->tkr.mult; - vdata->clock.shift = tk->tkr.shift; + vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode; + vdata->clock.cycle_last = tk->tkr_mono.cycle_last; + vdata->clock.mask = tk->tkr_mono.mask; + vdata->clock.mult = tk->tkr_mono.mult; + vdata->clock.shift = tk->tkr_mono.shift; vdata->boot_ns = boot_ns; - vdata->nsec_base = tk->tkr.xtime_nsec; + vdata->nsec_base = tk->tkr_mono.xtime_nsec; write_seqcount_end(&vdata->seq); } diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 05af9a334893..73df17f1535f 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -16,16 +16,16 @@ * @read: Read function of @clock * @mask: Bitmask for two's complement subtraction of non 64bit clocks * @cycle_last: @clock cycle value at last update - * @mult: NTP adjusted multiplier for scaled math conversion + * @mult: (NTP adjusted) multiplier for scaled math conversion * @shift: Shift value for scaled math conversion * @xtime_nsec: Shifted (fractional) nano seconds offset for readout - * @base_mono: ktime_t (nanoseconds) base time for readout + * @base: ktime_t (nanoseconds) base time for readout * * This struct has size 56 byte on 64 bit. Together with a seqcount it * occupies a single 64byte cache line. * * The struct is separate from struct timekeeper as it is also used - * for a fast NMI safe accessor to clock monotonic. + * for a fast NMI safe accessors. 
*/ struct tk_read_base { struct clocksource *clock; @@ -35,12 +35,12 @@ struct tk_read_base { u32 mult; u32 shift; u64 xtime_nsec; - ktime_t base_mono; + ktime_t base; }; /** * struct timekeeper - Structure holding internal timekeeping values. - * @tkr: The readout base structure + * @tkr_mono: The readout base structure for CLOCK_MONOTONIC * @xtime_sec: Current CLOCK_REALTIME time in seconds * @ktime_sec: Current CLOCK_MONOTONIC time in seconds * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset @@ -76,7 +76,7 @@ struct tk_read_base { * used instead. */ struct timekeeper { - struct tk_read_base tkr; + struct tk_read_base tkr_mono; u64 xtime_sec; unsigned long ktime_sec; struct timespec64 wall_to_monotonic; diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 892f6cbf1e67..1405091f3acb 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -68,8 +68,8 @@ bool __read_mostly persistent_clock_exist = false; static inline void tk_normalize_xtime(struct timekeeper *tk) { - while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) { - tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift; + while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { + tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift; tk->xtime_sec++; } } @@ -79,20 +79,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk) struct timespec64 ts; ts.tv_sec = tk->xtime_sec; - ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift); + ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); return ts; } static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec = ts->tv_sec; - tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift; + tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift; } static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts) { tk->xtime_sec += ts->tv_sec; - tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift; + tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift; tk_normalize_xtime(tk); } @@ -136,8 +136,8 @@ static long timekeeping_last_warning; static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset) { - cycle_t max_cycles = tk->tkr.clock->max_cycles; - const char *name = tk->tkr.clock->name; + cycle_t max_cycles = tk->tkr_mono.clock->max_cycles; + const char *name = tk->tkr_mono.clock->name; if (offset > max_cycles) { printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n", @@ -246,11 +246,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; - old_clock = tk->tkr.clock; - tk->tkr.clock = clock; - tk->tkr.read = clock->read; - tk->tkr.mask = clock->mask; - tk->tkr.cycle_last = tk->tkr.read(clock); + old_clock = tk->tkr_mono.clock; + tk->tkr_mono.clock = clock; + tk->tkr_mono.read = clock->read; + tk->tkr_mono.mask = clock->mask; + tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; @@ -274,11 +274,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) if (old_clock) { int shift_change = clock->shift - old_clock->shift; if (shift_change < 0) - tk->tkr.xtime_nsec >>= -shift_change; + tk->tkr_mono.xtime_nsec >>= -shift_change; else - tk->tkr.xtime_nsec <<= shift_change; + 
tk->tkr_mono.xtime_nsec <<= shift_change; } - tk->tkr.shift = clock->shift; + tk->tkr_mono.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; @@ -289,7 +289,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) * active clocksource. These value will be adjusted via NTP * to counteract clock drifting. */ - tk->tkr.mult = clock->mult; + tk->tkr_mono.mult = clock->mult; tk->ntp_err_mult = 0; } @@ -318,11 +318,11 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr.clock; + struct clocksource *clock = tk->tkr_mono.clock; cycle_t delta; s64 nsec; - delta = timekeeping_get_delta(&tk->tkr); + delta = timekeeping_get_delta(&tk->tkr_mono); /* convert delta to nanoseconds. */ nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); @@ -428,7 +428,7 @@ u64 notrace ktime_get_mono_fast_ns(void) do { seq = raw_read_seqcount(&tk_fast_mono.seq); tkr = tk_fast_mono.base + (seq & 0x01); - now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr); + now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); return now; @@ -456,7 +456,7 @@ static cycle_t dummy_clock_read(struct clocksource *cs) static void halt_fast_timekeeper(struct timekeeper *tk) { static struct tk_read_base tkr_dummy; - struct tk_read_base *tkr = &tk->tkr; + struct tk_read_base *tkr = &tk->tkr_mono; memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tkr->read(tkr->clock); @@ -472,8 +472,8 @@ static inline void update_vsyscall(struct timekeeper *tk) xt = timespec64_to_timespec(tk_xtime(tk)); wm = timespec64_to_timespec(tk->wall_to_monotonic); - update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult, - tk->tkr.cycle_last); + update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult, + tk->tkr_mono.cycle_last); } static inline void old_vsyscall_fixup(struct timekeeper *tk) @@ -490,11 +490,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD * users are removed, this can be killed. */ - remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1); - tk->tkr.xtime_nsec -= remainder; - tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift; + remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1); + tk->tkr_mono.xtime_nsec -= remainder; + tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift; tk->ntp_error += remainder << tk->ntp_error_shift; - tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift; + tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift; } #else #define old_vsyscall_fixup(tk) @@ -559,7 +559,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) */ seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec); nsec = (u32) tk->wall_to_monotonic.tv_nsec; - tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); + tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* Update the monotonic raw base */ tk->base_raw = timespec64_to_ktime(tk->raw_time); @@ -569,7 +569,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) * wall_to_monotonic can be greater/equal one second. Take * this into account before updating tk->ktime_sec. 
*/ - nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift); + nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); if (nsec >= NSEC_PER_SEC) seconds++; tk->ktime_sec = seconds; @@ -592,7 +592,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); - update_fast_timekeeper(&tk->tkr); + update_fast_timekeeper(&tk->tkr_mono); } /** @@ -604,18 +604,18 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) */ static void timekeeping_forward_now(struct timekeeper *tk) { - struct clocksource *clock = tk->tkr.clock; + struct clocksource *clock = tk->tkr_mono.clock; cycle_t cycle_now, delta; s64 nsec; - cycle_now = tk->tkr.read(clock); - delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask); - tk->tkr.cycle_last = cycle_now; + cycle_now = tk->tkr_mono.read(clock); + delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); + tk->tkr_mono.cycle_last = cycle_now; - tk->tkr.xtime_nsec += delta * tk->tkr.mult; + tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; /* If arch requires, add in get_arch_timeoffset() */ - tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift; + tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift; tk_normalize_xtime(tk); @@ -640,7 +640,7 @@ int __getnstimeofday64(struct timespec64 *ts) seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsecs = timekeeping_get_ns(&tk->tkr); + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -680,8 +680,8 @@ ktime_t ktime_get(void) do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->tkr.base_mono; - nsecs = timekeeping_get_ns(&tk->tkr); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -706,8 +706,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs) do { seq = read_seqcount_begin(&tk_core.seq); - base = ktime_add(tk->tkr.base_mono, *offset); - nsecs = timekeeping_get_ns(&tk->tkr); + base = ktime_add(tk->tkr_mono.base, *offset); + nsecs = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -777,7 +777,7 @@ void ktime_get_ts64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); ts->tv_sec = tk->xtime_sec; - nsec = timekeeping_get_ns(&tk->tkr); + nsec = timekeeping_get_ns(&tk->tkr_mono); tomono = tk->wall_to_monotonic; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -863,7 +863,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) ts_real->tv_nsec = 0; nsecs_raw = timekeeping_get_ns_raw(tk); - nsecs_real = timekeeping_get_ns(&tk->tkr); + nsecs_real = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1046,7 +1046,7 @@ static int change_clocksource(void *data) */ if (try_module_get(new->owner)) { if (!new->enable || new->enable(new) == 0) { - old = tk->tkr.clock; + old = tk->tkr_mono.clock; tk_setup_internals(tk, new); if (old->disable) old->disable(old); @@ -1074,11 +1074,11 @@ int timekeeping_notify(struct clocksource *clock) { struct timekeeper *tk = &tk_core.timekeeper; - if (tk->tkr.clock == clock) + if (tk->tkr_mono.clock == clock) return 0; stop_machine(change_clocksource, clock, NULL); tick_clock_notify(); - return tk->tkr.clock == clock ? 0 : -1; + return tk->tkr_mono.clock == clock ? 
0 : -1; } /** @@ -1119,7 +1119,7 @@ int timekeeping_valid_for_hres(void) do { seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; + ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1138,7 +1138,7 @@ u64 timekeeping_max_deferment(void) do { seq = read_seqcount_begin(&tk_core.seq); - ret = tk->tkr.clock->max_idle_ns; + ret = tk->tkr_mono.clock->max_idle_ns; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1303,7 +1303,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) void timekeeping_resume(void) { struct timekeeper *tk = &tk_core.timekeeper; - struct clocksource *clock = tk->tkr.clock; + struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; struct timespec tmp; @@ -1331,16 +1331,16 @@ void timekeeping_resume(void) * The less preferred source will only be tried if there is no better * usable source. The rtc part is handled separately in rtc core code. */ - cycle_now = tk->tkr.read(clock); + cycle_now = tk->tkr_mono.read(clock); if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) && - cycle_now > tk->tkr.cycle_last) { + cycle_now > tk->tkr_mono.cycle_last) { u64 num, max = ULLONG_MAX; u32 mult = clock->mult; u32 shift = clock->shift; s64 nsec = 0; - cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, - tk->tkr.mask); + cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, + tk->tkr_mono.mask); /* * "cycle_delta * mutl" may cause 64 bits overflow, if the @@ -1366,7 +1366,7 @@ void timekeeping_resume(void) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ - tk->tkr.cycle_last = cycle_now; + tk->tkr_mono.cycle_last = cycle_now; tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1519,15 +1519,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk, * * XXX - TODO: Doc ntp_error calculation. */ - if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) { + if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) { /* NTP adjustment caused clocksource mult overflow */ WARN_ON_ONCE(1); return; } - tk->tkr.mult += mult_adj; + tk->tkr_mono.mult += mult_adj; tk->xtime_interval += interval; - tk->tkr.xtime_nsec -= offset; + tk->tkr_mono.xtime_nsec -= offset; tk->ntp_error -= (interval - offset) << tk->ntp_error_shift; } @@ -1589,13 +1589,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) tk->ntp_err_mult = 0; } - if (unlikely(tk->tkr.clock->maxadj && - (abs(tk->tkr.mult - tk->tkr.clock->mult) - > tk->tkr.clock->maxadj))) { + if (unlikely(tk->tkr_mono.clock->maxadj && + (abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult) + > tk->tkr_mono.clock->maxadj))) { printk_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", - tk->tkr.clock->name, (long)tk->tkr.mult, - (long)tk->tkr.clock->mult + tk->tkr.clock->maxadj); + tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult, + (long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj); } /* @@ -1612,9 +1612,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * We'll correct this error next time through this function, when * xtime_nsec is not as small. 
*/ - if (unlikely((s64)tk->tkr.xtime_nsec < 0)) { - s64 neg = -(s64)tk->tkr.xtime_nsec; - tk->tkr.xtime_nsec = 0; + if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) { + s64 neg = -(s64)tk->tkr_mono.xtime_nsec; + tk->tkr_mono.xtime_nsec = 0; tk->ntp_error += neg << tk->ntp_error_shift; } } @@ -1629,13 +1629,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { - u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift; + u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift; unsigned int clock_set = 0; - while (tk->tkr.xtime_nsec >= nsecps) { + while (tk->tkr_mono.xtime_nsec >= nsecps) { int leap; - tk->tkr.xtime_nsec -= nsecps; + tk->tkr_mono.xtime_nsec -= nsecps; tk->xtime_sec++; /* Figure out if its a leap sec and apply if needed */ @@ -1680,9 +1680,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; - tk->tkr.cycle_last += interval; + tk->tkr_mono.cycle_last += interval; - tk->tkr.xtime_nsec += tk->xtime_interval << shift; + tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ @@ -1725,8 +1725,8 @@ void update_wall_time(void) #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET offset = real_tk->cycle_interval; #else - offset = clocksource_delta(tk->tkr.read(tk->tkr.clock), - tk->tkr.cycle_last, tk->tkr.mask); + offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock), + tk->tkr_mono.cycle_last, tk->tkr_mono.mask); #endif /* Check if there's really nothing to do */ @@ -1890,8 +1890,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot, do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->tkr.base_mono; - nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift; + base = tk->tkr_mono.base; + nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift; *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; @@ -1922,8 +1922,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot, do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->tkr.base_mono; - nsecs = timekeeping_get_ns(&tk->tkr); + base = tk->tkr_mono.base; + nsecs = timekeeping_get_ns(&tk->tkr_mono); *offs_real = tk->offs_real; *offs_boot = tk->offs_boot; -- cgit v1.3-8-gc7d7 From 4a4ad80d32cea69ee93bd4589f24dc478804cd80 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2015 09:28:44 +0100 Subject: time: Add timerkeeper::tkr_raw Introduce tkr_raw and make use of it. base_raw -> tkr_raw.base clock->{mult,shift} -> tkr_raw.{mult.shift} Kill timekeeping_get_ns_raw() in favour of timekeeping_get_ns(&tkr_raw), this removes all mono_raw special casing. Duplicate the updates to tkr_mono.cycle_last into tkr_raw.cycle_last, both need the same value. Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150319093400.422589590@infradead.org Signed-off-by: Ingo Molnar --- include/linux/timekeeper_internal.h | 4 ++-- kernel/time/timekeeping.c | 41 +++++++++++++++++-------------------- 2 files changed, 21 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index 73df17f1535f..fb86963859c7 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -41,6 +41,7 @@ struct tk_read_base { /** * struct timekeeper - Structure holding internal timekeeping values. 
* @tkr_mono: The readout base structure for CLOCK_MONOTONIC + * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW * @xtime_sec: Current CLOCK_REALTIME time in seconds * @ktime_sec: Current CLOCK_MONOTONIC time in seconds * @wall_to_monotonic: CLOCK_REALTIME to CLOCK_MONOTONIC offset @@ -48,7 +49,6 @@ struct tk_read_base { * @offs_boot: Offset clock monotonic -> clock boottime * @offs_tai: Offset clock monotonic -> clock tai * @tai_offset: The current UTC to TAI offset in seconds - * @base_raw: Monotonic raw base time in ktime_t format * @raw_time: Monotonic raw base time in timespec64 format * @cycle_interval: Number of clock cycles in one NTP interval * @xtime_interval: Number of clock shifted nano seconds in one NTP @@ -77,6 +77,7 @@ struct tk_read_base { */ struct timekeeper { struct tk_read_base tkr_mono; + struct tk_read_base tkr_raw; u64 xtime_sec; unsigned long ktime_sec; struct timespec64 wall_to_monotonic; @@ -84,7 +85,6 @@ struct timekeeper { ktime_t offs_boot; ktime_t offs_tai; s32 tai_offset; - ktime_t base_raw; struct timespec64 raw_time; /* The following members are for timekeeping internal use */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 1405091f3acb..cbb612ee813f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -252,6 +252,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_mono.mask = clock->mask; tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock); + tk->tkr_raw.clock = clock; + tk->tkr_raw.read = clock->read; + tk->tkr_raw.mask = clock->mask; + tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last; + /* Do the ns -> cycle conversion first, using original mult */ tmp = NTP_INTERVAL_LENGTH; tmp <<= clock->shift; @@ -278,7 +283,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) else tk->tkr_mono.xtime_nsec <<= shift_change; } + tk->tkr_raw.xtime_nsec = 0; + tk->tkr_mono.shift = clock->shift; + tk->tkr_raw.shift = clock->shift; tk->ntp_error = 0; tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift; @@ -290,6 +298,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) * to counteract clock drifting. */ tk->tkr_mono.mult = clock->mult; + tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; } @@ -316,21 +325,6 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) return nsec + arch_gettimeoffset(); } -static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk) -{ - struct clocksource *clock = tk->tkr_mono.clock; - cycle_t delta; - s64 nsec; - - delta = timekeeping_get_delta(&tk->tkr_mono); - - /* convert delta to nanoseconds. */ - nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); - - /* If arch requires, add in get_arch_timeoffset() */ - return nsec + arch_gettimeoffset(); -} - /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. 
* @tkr: Timekeeping readout base from which we take the update @@ -562,7 +556,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec); /* Update the monotonic raw base */ - tk->base_raw = timespec64_to_ktime(tk->raw_time); + tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time); /* * The sum of the nanoseconds portions of xtime and @@ -611,6 +605,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) cycle_now = tk->tkr_mono.read(clock); delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask); tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult; @@ -619,7 +614,7 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_normalize_xtime(tk); - nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift); + nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift); timespec64_add_ns(&tk->raw_time, nsec); } @@ -748,8 +743,8 @@ ktime_t ktime_get_raw(void) do { seq = read_seqcount_begin(&tk_core.seq); - base = tk->base_raw; - nsecs = timekeeping_get_ns_raw(tk); + base = tk->tkr_raw.base; + nsecs = timekeeping_get_ns(&tk->tkr_raw); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -862,7 +857,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real) ts_real->tv_sec = tk->xtime_sec; ts_real->tv_nsec = 0; - nsecs_raw = timekeeping_get_ns_raw(tk); + nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); nsecs_real = timekeeping_get_ns(&tk->tkr_mono); } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1096,7 +1091,7 @@ void getrawmonotonic64(struct timespec64 *ts) do { seq = read_seqcount_begin(&tk_core.seq); - nsecs = timekeeping_get_ns_raw(tk); + nsecs = timekeeping_get_ns(&tk->tkr_raw); ts64 = tk->raw_time; } while (read_seqcount_retry(&tk_core.seq, seq)); @@ -1217,7 +1212,6 @@ void __init timekeeping_init(void) tk_set_xtime(tk, &now); tk->raw_time.tv_sec = 0; tk->raw_time.tv_nsec = 0; - tk->base_raw.tv64 = 0; if (boot.tv_sec == 0 && boot.tv_nsec == 0) boot = tk_xtime(tk); @@ -1367,6 +1361,8 @@ void timekeeping_resume(void) /* Re-base the last cycle value */ tk->tkr_mono.cycle_last = cycle_now; + tk->tkr_raw.cycle_last = cycle_now; + tk->ntp_error = 0; timekeeping_suspended = 0; timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); @@ -1681,6 +1677,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, /* Accumulate one shifted interval */ offset -= interval; tk->tkr_mono.cycle_last += interval; + tk->tkr_raw.cycle_last += interval; tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift; *clock_set |= accumulate_nsecs_to_secs(tk); -- cgit v1.3-8-gc7d7 From 4498e7467e9e441c18ca12f1ca08460356e0508a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2015 09:36:19 +0100 Subject: time: Parametrize all tk_fast_mono users In preparation for more tk_fast instances, remove all hard-coded tk_fast_mono references. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150319093400.484279927@infradead.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index cbb612ee813f..278373edb472 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -364,18 +364,18 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) * slightly wrong timestamp (a few nanoseconds). See * @ktime_get_mono_fast_ns. */ -static void update_fast_timekeeper(struct tk_read_base *tkr) +static void update_fast_timekeeper(struct tk_read_base *tkr, struct tk_fast *tkf) { - struct tk_read_base *base = tk_fast_mono.base; + struct tk_read_base *base = tkf->base; /* Force readers off to base[1] */ - raw_write_seqcount_latch(&tk_fast_mono.seq); + raw_write_seqcount_latch(&tkf->seq); /* Update base[0] */ memcpy(base, tkr, sizeof(*base)); /* Force readers back to base[0] */ - raw_write_seqcount_latch(&tk_fast_mono.seq); + raw_write_seqcount_latch(&tkf->seq); /* Update base[1] */ memcpy(base + 1, base, sizeof(*base)); @@ -413,20 +413,25 @@ static void update_fast_timekeeper(struct tk_read_base *tkr) * of the following timestamps. Callers need to be aware of that and * deal with it. */ -u64 notrace ktime_get_mono_fast_ns(void) +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) { struct tk_read_base *tkr; unsigned int seq; u64 now; do { - seq = raw_read_seqcount(&tk_fast_mono.seq); - tkr = tk_fast_mono.base + (seq & 0x01); + seq = raw_read_seqcount(&tkf->seq); + tkr = tkf->base + (seq & 0x01); now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr); + } while (read_seqcount_retry(&tkf->seq, seq)); - } while (read_seqcount_retry(&tk_fast_mono.seq, seq)); return now; } + +u64 ktime_get_mono_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_mono); +} EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); /* Suspend-time cycles value for halted fast timekeeper. */ @@ -455,7 +460,7 @@ static void halt_fast_timekeeper(struct timekeeper *tk) memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); cycles_at_suspend = tkr->read(tkr->clock); tkr_dummy.read = dummy_clock_read; - update_fast_timekeeper(&tkr_dummy); + update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); } #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD @@ -586,7 +591,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) memcpy(&shadow_timekeeper, &tk_core.timekeeper, sizeof(tk_core.timekeeper)); - update_fast_timekeeper(&tk->tkr_mono); + update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); } /** -- cgit v1.3-8-gc7d7 From f09cb9a1808e35ad7502ea39b6bfb443c7fa0f19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2015 09:39:08 +0100 Subject: time: Introduce tk_fast_raw Add the NMI safe CLOCK_MONOTONIC_RAW accessor.. 
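Usage note (illustrative sketch, not part of the patch): the new accessor mirrors ktime_get_mono_fast_ns() and may be called from NMI context, for example to timestamp an NMI path against a fixed-rate clock. The handler below is hypothetical; only ktime_get_raw_fast_ns() and trace_printk() are real interfaces here.

#include <linux/kernel.h>
#include <linux/timekeeping.h>

/* Hypothetical handler, for illustration only. */
static void example_nmi_timestamp(void)
{
	u64 t0, t1;

	t0 = ktime_get_raw_fast_ns();	/* CLOCK_MONOTONIC_RAW, NMI safe */
	/* ... minimal NMI work ... */
	t1 = ktime_get_raw_fast_ns();

	trace_printk("nmi path: %llu ns\n",
		     (unsigned long long)(t1 - t0));
}
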
Signed-off-by: Peter Zijlstra (Intel) Acked-by: John Stultz Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150319093400.562746929@infradead.org Signed-off-by: Ingo Molnar --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 3eaae4754275..f36b1edf3f73 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -220,6 +220,7 @@ static inline u64 ktime_get_raw_ns(void) } extern u64 ktime_get_mono_fast_ns(void); +extern u64 ktime_get_raw_fast_ns(void); /* * Timespec interfaces utilizing the ktime based ones diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 278373edb472..c3fcff06d30a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -59,6 +59,7 @@ struct tk_fast { }; static struct tk_fast tk_fast_mono ____cacheline_aligned; +static struct tk_fast tk_fast_raw ____cacheline_aligned; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; @@ -434,6 +435,12 @@ u64 ktime_get_mono_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); +u64 ktime_get_raw_fast_ns(void) +{ + return __ktime_get_fast_ns(&tk_fast_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); + /* Suspend-time cycles value for halted fast timekeeper. */ static cycle_t cycles_at_suspend; @@ -461,6 +468,11 @@ static void halt_fast_timekeeper(struct timekeeper *tk) cycles_at_suspend = tkr->read(tkr->clock); tkr_dummy.read = dummy_clock_read; update_fast_timekeeper(&tkr_dummy, &tk_fast_mono); + + tkr = &tk->tkr_raw; + memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy)); + tkr_dummy.read = dummy_clock_read; + update_fast_timekeeper(&tkr_dummy, &tk_fast_raw); } #ifdef CONFIG_GENERIC_TIME_VSYSCALL_OLD @@ -592,6 +604,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) sizeof(tk_core.timekeeper)); update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono); + update_fast_timekeeper(&tk->tkr_raw, &tk_fast_raw); } /** -- cgit v1.3-8-gc7d7 From ccd41c86ad4d464d0ed4e48d80759ff85c2115b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Feb 2015 15:56:04 +0100 Subject: perf: Fix racy group access While looking at some fuzzer output I noticed that we do not hold any locks on leader->ctx and therefore the sibling_list iteration is unsafe. Acquire the relevant ctx->mutex before calling into the pmu specific code. 
Signed-off-by: Peter Zijlstra (Intel) Cc: Vince Weaver Cc: Jiri Olsa Cc: Sasha Levin Link: http://lkml.kernel.org/r/20150225151639.GL5029@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/events/core.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index b01dfb602db1..bb1a7c36e794 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7036,12 +7036,23 @@ EXPORT_SYMBOL_GPL(perf_pmu_unregister); static int perf_try_init_event(struct pmu *pmu, struct perf_event *event) { + struct perf_event_context *ctx = NULL; int ret; if (!try_module_get(pmu->module)) return -ENODEV; + + if (event->group_leader != event) { + ctx = perf_event_ctx_lock(event->group_leader); + BUG_ON(!ctx); + } + event->pmu = pmu; ret = pmu->event_init(event); + + if (ctx) + perf_event_ctx_unlock(event->group_leader, ctx); + if (ret) module_put(pmu->module); -- cgit v1.3-8-gc7d7 From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Feb 2015 14:05:38 +0100 Subject: perf: Add per event clockid support While thinking on the whole clock discussion it occurred to me we have two distinct uses of time: 1) the tracking of event/ctx/cgroup enabled/running/stopped times which includes the self-monitoring support in struct perf_event_mmap_page. 2) the actual timestamps visible in the data records. And we've been conflating them. The first is all about tracking time deltas, nobody should really care in what time base that happens, its all relative information, as long as its internally consistent it works. The second however is what people are worried about when having to merge their data with external sources. And here we have the discussion on MONOTONIC vs MONOTONIC_RAW etc.. Where MONOTONIC is good for correlating between machines (static offset), MONOTNIC_RAW is required for correlating against a fixed rate hardware clock. This means configurability; now 1) makes that hard because it needs to be internally consistent across groups of unrelated events; which is why we had to have a global perf_clock(). However, for 2) it doesn't really matter, perf itself doesn't care what it writes into the buffer. The below patch makes the distinction between these two cases by adding perf_event_clock() which is used for the second case. It further makes this configurable on a per-event basis, but adds a few sanity checks such that we cannot combine events with different clocks in confusing ways. And since we then have per-event configurability we might as well retain the 'legacy' behaviour as a default. 
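From userspace the selection would look roughly like the sketch below. This is illustrative only: open_cycles_raw_clock() is a made-up helper, the sample_period is arbitrary, error handling is omitted, and it assumes the clockid switch accepts CLOCK_MONOTONIC_RAW, which the changelog implies but the hunks below do not show. The use_clockid bit and clockid field are the perf_event_attr additions from this patch.

#include <linux/perf_event.h>
#include <string.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

/* Open a cycles counter whose record timestamps use CLOCK_MONOTONIC_RAW. */
static int open_cycles_raw_clock(pid_t pid, int cpu)
{
	struct perf_event_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TIME;
	attr.sample_period = 100000;
	attr.use_clockid = 1;			/* bit added by this patch */
	attr.clockid = CLOCK_MONOTONIC_RAW;	/* field added by this patch */

	/* group_fd = -1, flags = 0 */
	return syscall(__NR_perf_event_open, &attr, pid, cpu, -1, 0);
}
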
Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: David Ahern Cc: Jiri Olsa Cc: John Stultz Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 14 ++++++-- include/linux/perf_event.h | 2 ++ include/uapi/linux/perf_event.h | 6 ++-- kernel/events/core.c | 77 ++++++++++++++++++++++++++++++++++++++-- 4 files changed, 91 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ac41b3ad1fc9..0420ebcac116 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event, data = cyc2ns_read_begin(); + /* + * Internal timekeeping for enabled/running/stopped times + * is always in the local_clock domain. + */ userpg->cap_user_time = 1; userpg->time_mult = data->cyc2ns_mul; userpg->time_shift = data->cyc2ns_shift; userpg->time_offset = data->cyc2ns_offset - now; - userpg->cap_user_time_zero = 1; - userpg->time_zero = data->cyc2ns_offset; + /* + * cap_user_time_zero doesn't make sense when we're using a different + * time base for the records. + */ + if (event->clock == &local_clock) { + userpg->cap_user_time_zero = 1; + userpg->time_zero = data->cyc2ns_offset; + } cyc2ns_read_end(data); } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index b16eac5f54ce..401554074de9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -173,6 +173,7 @@ struct perf_event; * pmu::capabilities flags */ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 +#define PERF_PMU_CAP_NO_NMI 0x02 /** * struct pmu - generic performance monitoring unit @@ -457,6 +458,7 @@ struct perf_event { struct pid_namespace *ns; u64 id; + u64 (*clock)(void); perf_overflow_handler_t overflow_handler; void *overflow_handler_context; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1e3cd07cf76e..3bb40ddadbe5 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -326,7 +326,8 @@ struct perf_event_attr { exclude_callchain_user : 1, /* exclude user callchains */ mmap2 : 1, /* include mmap with inode data */ comm_exec : 1, /* flag comm events that are due to an exec */ - __reserved_1 : 39; + use_clockid : 1, /* use @clockid for time fields */ + __reserved_1 : 38; union { __u32 wakeup_events; /* wakeup every n events */ @@ -355,8 +356,7 @@ struct perf_event_attr { */ __u32 sample_stack_user; - /* Align to u64. 
*/ - __u32 __reserved_2; + __s32 clockid; /* * Defines set of regs to dump for each sample * state captured on: diff --git a/kernel/events/core.c b/kernel/events/core.c index bb1a7c36e794..c40c2cac2d8e 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -327,6 +327,11 @@ static inline u64 perf_clock(void) return local_clock(); } +static inline u64 perf_event_clock(struct perf_event *event) +{ + return event->clock(); +} + static inline struct perf_cpu_context * __get_cpu_context(struct perf_event_context *ctx) { @@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header, } if (sample_type & PERF_SAMPLE_TIME) - data->time = perf_clock(); + data->time = perf_event_clock(event); if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER)) data->id = primary_event_id(event); @@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); + task_event->event_id.time = perf_event_clock(event); + perf_output_put(&handle, task_event->event_id); perf_event__output_id_sample(event, &handle, &sample); @@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ - .time = perf_clock(), + /* .time */ }, }; @@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable) .misc = 0, .size = sizeof(throttle_event), }, - .time = perf_clock(), + .time = perf_event_clock(event), .id = primary_event_id(event), .stream_id = event->id, }; @@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event) static struct pmu perf_swevent = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, @@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event) static struct pmu perf_cpu_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, @@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event) static struct pmu perf_task_clock = { .task_ctx_nr = perf_sw_context, + .capabilities = PERF_PMU_CAP_NO_NMI, + .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, @@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, event->hw.target = task; } + event->clock = &local_clock; + if (parent_event) + event->clock = parent_event->clock; + if (!overflow_handler && parent_event) { overflow_handler = parent_event->overflow_handler; context = parent_event->overflow_handler_context; @@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; + /* + * Mixing clocks in the same buffer is trouble you don't need. 
+ */ + if (output_event->clock != event->clock) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ @@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b) mutex_lock_nested(b, SINGLE_DEPTH_NESTING); } +static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id) +{ + bool nmi_safe = false; + + switch (clk_id) { + case CLOCK_MONOTONIC: + event->clock = &ktime_get_mono_fast_ns; + nmi_safe = true; + break; + + case CLOCK_MONOTONIC_RAW: + event->clock = &ktime_get_raw_fast_ns; + nmi_safe = true; + break; + + case CLOCK_REALTIME: + event->clock = &ktime_get_real_ns; + break; + + case CLOCK_BOOTTIME: + event->clock = &ktime_get_boot_ns; + break; + + case CLOCK_TAI: + event->clock = &ktime_get_tai_ns; + break; + + default: + return -EINVAL; + } + + if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI)) + return -EINVAL; + + return 0; +} + /** * sys_perf_event_open - open a performance event, associate it to a task/cpu * @@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open, */ pmu = event->pmu; + if (attr.use_clockid) { + err = perf_event_set_clock(event, attr.clockid); + if (err) + goto err_alloc; + } + if (group_leader && (is_software_event(event) != is_software_event(group_leader))) { if (is_software_event(event)) { @@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open, */ if (group_leader->group_leader != group_leader) goto err_context; + + /* All events in a group should have the same clock */ + if (group_leader->clock != event->clock) + goto err_context; + /* * Do not allow to attach to a group in a different * task or CPU context: -- cgit v1.3-8-gc7d7 From 554ef3876c6acdff1331feab10275e9e9e0adb84 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 27 Feb 2015 17:21:32 +0530 Subject: clockevents: Handle tick device's resume separately Upcoming patch will redefine possible states of a clockevent device. The RESUME mode is a special case only for tick's clockevent devices. In future it can be replaced by ->resume() callback already available for clockevent devices. Lets handle it separately so that clockevents_set_mode() only handles states valid across all devices. This also renames set_mode_resume() to tick_resume() to make it more explicit. 
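As a hedged driver-side sketch (the names are hypothetical, not taken from any in-tree driver), a clockevent device that needs resume handling now supplies ->tick_resume() instead of a CLOCK_EVT_MODE_RESUME case inside a legacy ->set_mode() switch:

#include <linux/clockchips.h>

static int mydrv_tick_resume(struct clock_event_device *ced)
{
	/* Re-enable / reprogram the timer hardware after suspend here. */
	return 0;
}

static struct clock_event_device mydrv_clockevent = {
	.name		= "mydrv-timer",
	.features	= CLOCK_EVT_FEAT_ONESHOT,
	.tick_resume	= mydrv_tick_resume,
	/* remaining callbacks, rating, cpumask etc. elided */
};
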
Signed-off-by: Viresh Kumar Acked-by: Peter Zijlstra Cc: Daniel Lezcano Cc: Frederic Weisbecker Cc: Kevin Hilman Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: linaro-kernel@lists.linaro.org Cc: linaro-networking@linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/c1b0112410870f49e7bf06958e1483eac6c15e20.1425037853.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 4 ++-- kernel/time/clockevents.c | 30 +++++++++++++++++++++--------- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 2 +- kernel/time/tick-internal.h | 1 + kernel/time/timer_list.c | 4 ++-- 6 files changed, 28 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 59af26b54d15..a41749543d48 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -87,7 +87,7 @@ enum clock_event_mode { * @set_mode_periodic: switch mode to periodic, if !set_mode * @set_mode_oneshot: switch mode to oneshot, if !set_mode * @set_mode_shutdown: switch mode to shutdown, if !set_mode - * @set_mode_resume: resume clkevt device, if !set_mode + * @tick_resume: resume clkevt device, if !set_mode * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration @@ -125,7 +125,7 @@ struct clock_event_device { int (*set_mode_periodic)(struct clock_event_device *); int (*set_mode_oneshot)(struct clock_event_device *); int (*set_mode_shutdown)(struct clock_event_device *); - int (*set_mode_resume)(struct clock_event_device *); + int (*tick_resume)(struct clock_event_device *); void (*broadcast)(const struct cpumask *mask); void (*suspend)(struct clock_event_device *); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 489642b08d64..1b0ea63de69c 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -100,7 +100,7 @@ static int __clockevents_set_mode(struct clock_event_device *dev, /* Transition with legacy set_mode() callback */ if (dev->set_mode) { /* Legacy callback doesn't support new modes */ - if (mode > CLOCK_EVT_MODE_RESUME) + if (mode > CLOCK_EVT_MODE_ONESHOT) return -ENOSYS; dev->set_mode(mode, dev); return 0; @@ -133,13 +133,6 @@ static int __clockevents_set_mode(struct clock_event_device *dev, return -ENOSYS; return dev->set_mode_oneshot(dev); - case CLOCK_EVT_MODE_RESUME: - /* Optional callback */ - if (dev->set_mode_resume) - return dev->set_mode_resume(dev); - else - return 0; - default: return -ENOSYS; } @@ -184,6 +177,25 @@ void clockevents_shutdown(struct clock_event_device *dev) dev->next_event.tv64 = KTIME_MAX; } +/** + * clockevents_tick_resume - Resume the tick device before using it again + * @dev: device to resume + */ +int clockevents_tick_resume(struct clock_event_device *dev) +{ + int ret = 0; + + if (dev->set_mode) + dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); + else if (dev->tick_resume) + ret = dev->tick_resume(dev); + + if (likely(!ret)) + dev->mode = CLOCK_EVT_MODE_RESUME; + + return ret; +} + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST /* Limit min_delta to a jiffie */ @@ -433,7 +445,7 @@ static int clockevents_sanity_check(struct clock_event_device *dev) if (dev->set_mode) { /* We shouldn't be supporting new modes now */ WARN_ON(dev->set_mode_periodic || dev->set_mode_oneshot || - dev->set_mode_shutdown || dev->set_mode_resume); + dev->set_mode_shutdown || dev->tick_resume); return 0; } diff 
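A hypothetical driver fragment (again, not from the patch) showing what the renamed per-state callbacks look like from a driver's point of view; together with ->tick_resume() they replace the single legacy ->set_mode():

#include <linux/clockchips.h>

static int mydrv_shutdown(struct clock_event_device *ced)
{
	/* Stop the timer hardware. */
	return 0;
}

static int mydrv_set_periodic(struct clock_event_device *ced)
{
	/* Program the hardware for a periodic tick. */
	return 0;
}

static int mydrv_set_oneshot(struct clock_event_device *ced)
{
	/* Switch the hardware to one-shot operation. */
	return 0;
}

static struct clock_event_device mydrv_clockevent = {
	.name			= "mydrv-timer",
	.features		= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
	.set_state_shutdown	= mydrv_shutdown,
	.set_state_periodic	= mydrv_set_periodic,
	.set_state_oneshot	= mydrv_set_oneshot,
};
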
--git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 066f0ec05e48..542d5bb5c13d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -464,7 +464,7 @@ int tick_resume_broadcast(void) bc = tick_broadcast_device.evtdev; if (bc) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_RESUME); + clockevents_tick_resume(bc); switch (tick_broadcast_device.mode) { case TICKDEV_MODE_PERIODIC: diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index f7c515595b42..5c50664c21d7 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -384,7 +384,7 @@ void tick_resume(void) struct tick_device *td = this_cpu_ptr(&tick_cpu_device); int broadcast = tick_resume_broadcast(); - clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_RESUME); + clockevents_tick_resume(td->evtdev); if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 366aeb4f2c66..98700e4a2000 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -32,6 +32,7 @@ extern bool tick_check_replacement(struct clock_event_device *curdev, extern void tick_install_replacement(struct clock_event_device *dev); extern void clockevents_shutdown(struct clock_event_device *dev); +extern int clockevents_tick_resume(struct clock_event_device *dev); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 2cfd19485824..2b3e9393034d 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -251,9 +251,9 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, "\n"); } - if (dev->set_mode_resume) { + if (dev->tick_resume) { SEQ_printf(m, " resume: "); - print_name_offset(m, dev->set_mode_resume); + print_name_offset(m, dev->tick_resume); SEQ_printf(m, "\n"); } } -- cgit v1.3-8-gc7d7 From 77e32c89a7117614ab3d66d20c1088de721abfaa Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 27 Feb 2015 17:21:33 +0530 Subject: clockevents: Manage device's state separately for the core 'enum clock_event_mode' is used for two purposes today: - to pass mode to the driver of clockevent device::set_mode(). - for managing state of the device for clockevents core. For supporting new modes/states we have moved away from the legacy set_mode() callback to new per-mode/state callbacks. New modes/states shouldn't be exposed to the legacy (now OBSOLOTE) callbacks and so we shouldn't add new states to 'enum clock_event_mode'. Lets have separate enums for the two use cases mentioned above. Keep using the earlier enum for legacy set_mode() callback and mark it OBSOLETE. And add another enum to clearly specify the possible states of a clockevent device. This also renames the newly added per-mode callbacks to reflect state changes. We haven't got rid of 'mode' member of 'struct clock_event_device' as it is used by some of the clockevent drivers and it would automatically die down once we migrate those drivers to the new interface. It ('mode') is only updated now for the drivers using the legacy interface. 
Suggested-by: Peter Zijlstra Suggested-by: Ingo Molnar Signed-off-by: Viresh Kumar Acked-by: Peter Zijlstra Cc: Daniel Lezcano Cc: Frederic Weisbecker Cc: Kevin Hilman Cc: Preeti U Murthy Cc: linaro-kernel@lists.linaro.org Cc: linaro-networking@linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/b6b0143a8a57bd58352ad35e08c25424c879c0cb.1425037853.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- arch/arm/common/bL_switcher.c | 8 ++-- include/linux/clockchips.h | 44 +++++++++++++------ kernel/time/clockevents.c | 99 +++++++++++++++++++++++-------------------- kernel/time/tick-broadcast.c | 20 ++++----- kernel/time/tick-common.c | 7 +-- kernel/time/tick-oneshot.c | 6 +-- kernel/time/timer_list.c | 12 +++--- 7 files changed, 111 insertions(+), 85 deletions(-) (limited to 'kernel') diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 6eaddc47c43d..d4f970a4d255 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -152,7 +152,7 @@ static int bL_switch_to(unsigned int new_cluster_id) unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster; struct completion inbound_alive; struct tick_device *tdev; - enum clock_event_mode tdev_mode; + enum clock_event_state tdev_state; long volatile *handshake_ptr; int ipi_nr, ret; @@ -223,8 +223,8 @@ static int bL_switch_to(unsigned int new_cluster_id) if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu))) tdev = NULL; if (tdev) { - tdev_mode = tdev->evtdev->mode; - clockevents_set_mode(tdev->evtdev, CLOCK_EVT_MODE_SHUTDOWN); + tdev_state = tdev->evtdev->state; + clockevents_set_state(tdev->evtdev, CLOCK_EVT_STATE_SHUTDOWN); } ret = cpu_pm_enter(); @@ -252,7 +252,7 @@ static int bL_switch_to(unsigned int new_cluster_id) ret = cpu_pm_exit(); if (tdev) { - clockevents_set_mode(tdev->evtdev, tdev_mode); + clockevents_set_state(tdev->evtdev, tdev_state); clockevents_program_event(tdev->evtdev, tdev->evtdev->next_event, 1); } diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index a41749543d48..e20232c3320a 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -32,15 +32,31 @@ enum clock_event_nofitiers { struct clock_event_device; struct module; -/* Clock event mode commands */ +/* Clock event mode commands for legacy ->set_mode(): OBSOLETE */ enum clock_event_mode { CLOCK_EVT_MODE_UNUSED = 0, CLOCK_EVT_MODE_SHUTDOWN, CLOCK_EVT_MODE_PERIODIC, CLOCK_EVT_MODE_ONESHOT, CLOCK_EVT_MODE_RESUME, +}; - /* Legacy ->set_mode() callback doesn't support below modes */ +/* + * Possible states of a clock event device. + * + * DETACHED: Device is not used by clockevents core. Initial state or can be + * reached from SHUTDOWN. + * SHUTDOWN: Device is powered-off. Can be reached from PERIODIC or ONESHOT. + * PERIODIC: Device is programmed to generate events periodically. Can be + * reached from DETACHED or SHUTDOWN. + * ONESHOT: Device is programmed to generate event only once. Can be reached + * from DETACHED or SHUTDOWN. 
+ */ +enum clock_event_state { + CLOCK_EVT_STATE_DETACHED = 0, + CLOCK_EVT_STATE_SHUTDOWN, + CLOCK_EVT_STATE_PERIODIC, + CLOCK_EVT_STATE_ONESHOT, }; /* @@ -80,13 +96,14 @@ enum clock_event_mode { * @min_delta_ns: minimum delta value in ns * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) - * @mode: operating mode assigned by the management code + * @mode: operating mode, relevant only to ->set_mode(), OBSOLETE + * @state: current state of the device, assigned by the core code * @features: features * @retries: number of forced programming retries * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. - * @set_mode_periodic: switch mode to periodic, if !set_mode - * @set_mode_oneshot: switch mode to oneshot, if !set_mode - * @set_mode_shutdown: switch mode to shutdown, if !set_mode + * @set_state_periodic: switch state to periodic, if !set_mode + * @set_state_oneshot: switch state to oneshot, if !set_mode + * @set_state_shutdown: switch state to shutdown, if !set_mode * @tick_resume: resume clkevt device, if !set_mode * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration @@ -111,20 +128,21 @@ struct clock_event_device { u32 mult; u32 shift; enum clock_event_mode mode; + enum clock_event_state state; unsigned int features; unsigned long retries; /* - * Mode transition callback(s): Only one of the two groups should be + * State transition callback(s): Only one of the two groups should be * defined: * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME. - * - set_mode_{shutdown|periodic|oneshot|resume}(). + * - set_state_{shutdown|periodic|oneshot}(), tick_resume(). */ void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); - int (*set_mode_periodic)(struct clock_event_device *); - int (*set_mode_oneshot)(struct clock_event_device *); - int (*set_mode_shutdown)(struct clock_event_device *); + int (*set_state_periodic)(struct clock_event_device *); + int (*set_state_oneshot)(struct clock_event_device *); + int (*set_state_shutdown)(struct clock_event_device *); int (*tick_resume)(struct clock_event_device *); void (*broadcast)(const struct cpumask *mask); @@ -177,8 +195,8 @@ extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); -extern void clockevents_set_mode(struct clock_event_device *dev, - enum clock_event_mode mode); +extern void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1b0ea63de69c..6e53e9a0c2e8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -94,44 +94,49 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) } EXPORT_SYMBOL_GPL(clockevent_delta2ns); -static int __clockevents_set_mode(struct clock_event_device *dev, - enum clock_event_mode mode) +static int __clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state) { /* Transition with legacy set_mode() callback */ if (dev->set_mode) { /* Legacy callback doesn't support new modes */ - if (mode > CLOCK_EVT_MODE_ONESHOT) + if (state > CLOCK_EVT_STATE_ONESHOT) return -ENOSYS; - dev->set_mode(mode, dev); + /* + * 'clock_event_state' and 'clock_event_mode' have 1-to-1 + 
* mapping until *_ONESHOT, and so a simple cast will work. + */ + dev->set_mode((enum clock_event_mode)state, dev); + dev->mode = (enum clock_event_mode)state; return 0; } if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; - /* Transition with new mode-specific callbacks */ - switch (mode) { - case CLOCK_EVT_MODE_UNUSED: + /* Transition with new state-specific callbacks */ + switch (state) { + case CLOCK_EVT_STATE_DETACHED: /* * This is an internal state, which is guaranteed to go from - * SHUTDOWN to UNUSED. No driver interaction required. + * SHUTDOWN to DETACHED. No driver interaction required. */ return 0; - case CLOCK_EVT_MODE_SHUTDOWN: - return dev->set_mode_shutdown(dev); + case CLOCK_EVT_STATE_SHUTDOWN: + return dev->set_state_shutdown(dev); - case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_STATE_PERIODIC: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_PERIODIC)) return -ENOSYS; - return dev->set_mode_periodic(dev); + return dev->set_state_periodic(dev); - case CLOCK_EVT_MODE_ONESHOT: + case CLOCK_EVT_STATE_ONESHOT: /* Core internal bug */ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT)) return -ENOSYS; - return dev->set_mode_oneshot(dev); + return dev->set_state_oneshot(dev); default: return -ENOSYS; @@ -139,26 +144,26 @@ static int __clockevents_set_mode(struct clock_event_device *dev, } /** - * clockevents_set_mode - set the operating mode of a clock event device + * clockevents_set_state - set the operating state of a clock event device * @dev: device to modify - * @mode: new mode + * @state: new state * * Must be called with interrupts disabled ! */ -void clockevents_set_mode(struct clock_event_device *dev, - enum clock_event_mode mode) +void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state) { - if (dev->mode != mode) { - if (__clockevents_set_mode(dev, mode)) + if (dev->state != state) { + if (__clockevents_set_state(dev, state)) return; - dev->mode = mode; + dev->state = state; /* * A nsec2cyc multiplicator of 0 is invalid and we'd crash * on it, so fix it up and emit a warning: */ - if (mode == CLOCK_EVT_MODE_ONESHOT) { + if (state == CLOCK_EVT_STATE_ONESHOT) { if (unlikely(!dev->mult)) { dev->mult = 1; WARN_ON(1); @@ -173,7 +178,7 @@ void clockevents_set_mode(struct clock_event_device *dev, */ void clockevents_shutdown(struct clock_event_device *dev) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); dev->next_event.tv64 = KTIME_MAX; } @@ -185,13 +190,12 @@ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; - if (dev->set_mode) + if (dev->set_mode) { dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); - else if (dev->tick_resume) - ret = dev->tick_resume(dev); - - if (likely(!ret)) dev->mode = CLOCK_EVT_MODE_RESUME; + } else if (dev->tick_resume) { + ret = dev->tick_resume(dev); + } return ret; } @@ -248,7 +252,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; dev->retries++; @@ -285,7 +289,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) delta = dev->min_delta_ns; dev->next_event = ktime_add_ns(ktime_get(), delta); - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; dev->retries++; @@ -317,7 +321,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t 
expires, dev->next_event = expires; - if (dev->mode == CLOCK_EVT_MODE_SHUTDOWN) + if (dev->state == CLOCK_EVT_STATE_SHUTDOWN) return 0; /* Shortcut for clockevent devices that can deal with ktime. */ @@ -362,7 +366,7 @@ static int clockevents_replace(struct clock_event_device *ced) struct clock_event_device *dev, *newdev = NULL; list_for_each_entry(dev, &clockevent_devices, list) { - if (dev == ced || dev->mode != CLOCK_EVT_MODE_UNUSED) + if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED) continue; if (!tick_check_replacement(newdev, dev)) @@ -388,7 +392,7 @@ static int clockevents_replace(struct clock_event_device *ced) static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu) { /* Fast track. Device is unused */ - if (ced->mode == CLOCK_EVT_MODE_UNUSED) { + if (ced->state == CLOCK_EVT_STATE_DETACHED) { list_del_init(&ced->list); return 0; } @@ -438,30 +442,30 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind); -/* Sanity check of mode transition callbacks */ +/* Sanity check of state transition callbacks */ static int clockevents_sanity_check(struct clock_event_device *dev) { /* Legacy set_mode() callback */ if (dev->set_mode) { /* We shouldn't be supporting new modes now */ - WARN_ON(dev->set_mode_periodic || dev->set_mode_oneshot || - dev->set_mode_shutdown || dev->tick_resume); + WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || + dev->set_state_shutdown || dev->tick_resume); return 0; } if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; - /* New mode-specific callbacks */ - if (!dev->set_mode_shutdown) + /* New state-specific callbacks */ + if (!dev->set_state_shutdown) return -EINVAL; if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && - !dev->set_mode_periodic) + !dev->set_state_periodic) return -EINVAL; if ((dev->features & CLOCK_EVT_FEAT_ONESHOT) && - !dev->set_mode_oneshot) + !dev->set_state_oneshot) return -EINVAL; return 0; @@ -478,6 +482,9 @@ void clockevents_register_device(struct clock_event_device *dev) BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(clockevents_sanity_check(dev)); + /* Initialize state to DETACHED */ + dev->state = CLOCK_EVT_STATE_DETACHED; + if (!dev->cpumask) { WARN_ON(num_possible_cpus() > 1); dev->cpumask = cpumask_of(smp_processor_id()); @@ -541,11 +548,11 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq) { clockevents_config(dev, freq); - if (dev->mode == CLOCK_EVT_MODE_ONESHOT) + if (dev->state == CLOCK_EVT_STATE_ONESHOT) return clockevents_program_event(dev, dev->next_event, false); - if (dev->mode == CLOCK_EVT_MODE_PERIODIC) - return __clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + if (dev->state == CLOCK_EVT_STATE_PERIODIC) + return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); return 0; } @@ -601,13 +608,13 @@ void clockevents_exchange_device(struct clock_event_device *old, */ if (old) { module_put(old->owner); - clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); + clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED); list_del(&old->list); list_add(&old->list, &clockevents_released); } if (new) { - BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); clockevents_shutdown(new); } local_irq_restore(flags); @@ -693,7 +700,7 @@ int clockevents_notify(unsigned long reason, void *arg) if (cpumask_test_cpu(cpu, dev->cpumask) && cpumask_weight(dev->cpumask) == 1 && !tick_is_broadcast_device(dev)) { - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); + BUG_ON(dev->state != 
CLOCK_EVT_STATE_DETACHED); list_del(&dev->list); } } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 542d5bb5c13d..f0f8ee9dbc28 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -303,7 +303,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * The device is in periodic mode. No reprogramming necessary: */ - if (dev->mode == CLOCK_EVT_MODE_PERIODIC) + if (dev->state == CLOCK_EVT_STATE_PERIODIC) goto unlock; /* @@ -532,8 +532,8 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, { int ret; - if (bc->mode != CLOCK_EVT_MODE_ONESHOT) - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + if (bc->state != CLOCK_EVT_STATE_ONESHOT) + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); ret = clockevents_program_event(bc, expires, force); if (!ret) @@ -543,7 +543,7 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); return 0; } @@ -562,8 +562,8 @@ void tick_check_oneshot_broadcast_this_cpu(void) * switched over, leave the device alone. */ if (td->mode == TICKDEV_MODE_ONESHOT) { - clockevents_set_mode(td->evtdev, - CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(td->evtdev, + CLOCK_EVT_STATE_ONESHOT); } } } @@ -666,7 +666,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, if (dev->next_event.tv64 < bc->next_event.tv64) return; } - clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN); + clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } static void broadcast_move_bc(int deadcpu) @@ -741,7 +741,7 @@ int tick_broadcast_oneshot_control(unsigned long reason) cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -842,7 +842,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) /* Set it up only once ! 
*/ if (bc->event_handler != tick_handle_oneshot_broadcast) { - int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC; bc->event_handler = tick_handle_oneshot_broadcast; @@ -858,7 +858,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_init_next_event(tmpmask, tick_next_period); tick_broadcast_set_event(bc, cpu, tick_next_period, 1); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 5c50664c21d7..a5b877130ae9 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -102,7 +102,7 @@ void tick_handle_periodic(struct clock_event_device *dev) tick_periodic(cpu); - if (dev->mode != CLOCK_EVT_MODE_ONESHOT) + if (dev->state != CLOCK_EVT_STATE_ONESHOT) return; for (;;) { /* @@ -140,7 +140,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) && !tick_broadcast_oneshot_active()) { - clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); + clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC); } else { unsigned long seq; ktime_t next; @@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) next = tick_next_period; } while (read_seqretry(&jiffies_lock, seq)); - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); for (;;) { if (!clockevents_program_event(dev, next, false)) @@ -365,6 +365,7 @@ void tick_shutdown(unsigned int *cpup) * Prevent that the clock events layer tries to call * the set mode function! 
*/ + dev->state = CLOCK_EVT_STATE_DETACHED; dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 7ce740e78e1b..67a64b1670bf 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -38,7 +38,7 @@ void tick_resume_oneshot(void) { struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev); - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(dev, ktime_get(), true); } @@ -50,7 +50,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, ktime_t next_event) { newdev->event_handler = handler; - clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT); clockevents_program_event(newdev, next_event, true); } @@ -81,7 +81,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)) td->mode = TICKDEV_MODE_ONESHOT; dev->event_handler = handler; - clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); + clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT); tick_broadcast_switch_to_oneshot(); return 0; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 2b3e9393034d..05aa5590106a 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -233,21 +233,21 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_mode); SEQ_printf(m, "\n"); } else { - if (dev->set_mode_shutdown) { + if (dev->set_state_shutdown) { SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_mode_shutdown); + print_name_offset(m, dev->set_state_shutdown); SEQ_printf(m, "\n"); } - if (dev->set_mode_periodic) { + if (dev->set_state_periodic) { SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_mode_periodic); + print_name_offset(m, dev->set_state_periodic); SEQ_printf(m, "\n"); } - if (dev->set_mode_oneshot) { + if (dev->set_state_oneshot) { SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_mode_oneshot); + print_name_offset(m, dev->set_state_oneshot); SEQ_printf(m, "\n"); } -- cgit v1.3-8-gc7d7 From de81e64b250d3865a75d221a80b4311e3273670a Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 27 Feb 2015 17:21:34 +0530 Subject: clockevents: Don't validate dev->mode against CLOCK_EVT_MODE_UNUSED for new interface It was a requirement in the legacy interface that drivers must initialize ->mode field to 'CLOCK_EVT_MODE_UNUSED'. This field isn't used anymore by the new interface and so should be only checked for the legacy interface. Probably it can be dropped as well as core doesn't rely on it anymore, but lets keep it to support legacy interface. 
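For clarity, a hedged sketch of the legacy-interface case this check is now limited to (driver name and callback are made up); only drivers that still provide ->set_mode() must start out with mode = CLOCK_EVT_MODE_UNUSED:

#include <linux/clockchips.h>

static void legacy_set_mode(enum clock_event_mode mode,
			    struct clock_event_device *ced)
{
	/* Legacy all-in-one mode switch. */
}

static struct clock_event_device legacy_clockevent = {
	.name		= "legacy-timer",
	.set_mode	= legacy_set_mode,
	.mode		= CLOCK_EVT_MODE_UNUSED,	/* still required for legacy drivers */
};
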
Signed-off-by: Viresh Kumar Acked-by: Peter Zijlstra Cc: Daniel Lezcano Cc: Frederic Weisbecker Cc: Kevin Hilman Cc: Peter Zijlstra Cc: Preeti U Murthy Cc: linaro-kernel@lists.linaro.org Cc: linaro-networking@linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: http://lkml.kernel.org/r/c6604fa1a77fe1fc8dcab87769857228fb1dadd5.1425037853.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clockevents.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 6e53e9a0c2e8..73689df1e4b8 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -450,6 +450,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev) /* We shouldn't be supporting new modes now */ WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || dev->set_state_shutdown || dev->tick_resume); + + BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); return 0; } @@ -479,7 +481,6 @@ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); BUG_ON(clockevents_sanity_check(dev)); /* Initialize state to DETACHED */ -- cgit v1.3-8-gc7d7 From 608cd71a9c7c9db76e78a792c5a4101e12fea43f Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 26 Mar 2015 19:53:57 -0700 Subject: tc: bpf: generalize pedit action existing TC action 'pedit' can munge any bits of the packet. Generalize it for use in bpf programs attached as cls_bpf and act_bpf via bpf_skb_store_bytes() helper function. Signed-off-by: Alexei Starovoitov Reviewed-by: Jiri Pirko Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 1 + kernel/bpf/verifier.c | 2 ++ net/core/filter.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 73 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 280a315de8d6..d5cda067115a 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -59,6 +59,7 @@ enum bpf_arg_type { ARG_PTR_TO_STACK, /* any pointer to eBPF program stack */ ARG_CONST_STACK_SIZE, /* number of bytes accessed from stack */ + ARG_PTR_TO_CTX, /* pointer to context */ ARG_ANYTHING, /* any (initialized) argument is ok */ }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 27dc4ec58840..74aab6e0d964 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -168,6 +168,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */ BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */ + BPF_FUNC_skb_store_bytes, /* int skb_store_bytes(skb, offset, from, len) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 0e714f799ec0..630a7bac1e51 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -773,6 +773,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; + } else if (arg_type == ARG_PTR_TO_CTX) { + expected_type = PTR_TO_CTX; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; diff --git a/net/core/filter.c b/net/core/filter.c index 32f43c59908c..444a07e4f68d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1175,6 +1175,56 @@ int sk_attach_bpf(u32 ufd, struct sock *sk) return 0; } +static u64 
bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + struct sk_buff *skb = (struct sk_buff *) (long) r1; + unsigned int offset = (unsigned int) r2; + void *from = (void *) (long) r3; + unsigned int len = (unsigned int) r4; + char buf[16]; + void *ptr; + + /* bpf verifier guarantees that: + * 'from' pointer points to bpf program stack + * 'len' bytes of it were initialized + * 'len' > 0 + * 'skb' is a valid pointer to 'struct sk_buff' + * + * so check for invalid 'offset' and too large 'len' + */ + if (offset > 0xffff || len > sizeof(buf)) + return -EFAULT; + + if (skb_cloned(skb) && !skb_clone_writable(skb, offset + len)) + return -EFAULT; + + ptr = skb_header_pointer(skb, offset, len, buf); + if (unlikely(!ptr)) + return -EFAULT; + + skb_postpull_rcsum(skb, ptr, len); + + memcpy(ptr, from, len); + + if (ptr == buf) + /* skb_store_bits cannot return -EFAULT here */ + skb_store_bits(skb, offset, ptr, len); + + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0)); + return 0; +} + +const struct bpf_func_proto bpf_skb_store_bytes_proto = { + .func = bpf_skb_store_bytes, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, + .arg3_type = ARG_PTR_TO_STACK, + .arg4_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto * sk_filter_func_proto(enum bpf_func_id func_id) { @@ -1194,6 +1244,17 @@ sk_filter_func_proto(enum bpf_func_id func_id) } } +static const struct bpf_func_proto * +tc_cls_act_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_skb_store_bytes: + return &bpf_skb_store_bytes_proto; + default: + return sk_filter_func_proto(func_id); + } +} + static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type) { @@ -1270,18 +1331,24 @@ static const struct bpf_verifier_ops sk_filter_ops = { .convert_ctx_access = sk_filter_convert_ctx_access, }; +static const struct bpf_verifier_ops tc_cls_act_ops = { + .get_func_proto = tc_cls_act_func_proto, + .is_valid_access = sk_filter_is_valid_access, + .convert_ctx_access = sk_filter_convert_ctx_access, +}; + static struct bpf_prog_type_list sk_filter_type __read_mostly = { .ops = &sk_filter_ops, .type = BPF_PROG_TYPE_SOCKET_FILTER, }; static struct bpf_prog_type_list sched_cls_type __read_mostly = { - .ops = &sk_filter_ops, + .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_CLS, }; static struct bpf_prog_type_list sched_act_type __read_mostly = { - .ops = &sk_filter_ops, + .ops = &tc_cls_act_ops, .type = BPF_PROG_TYPE_SCHED_ACT, }; -- cgit v1.3-8-gc7d7 From d631c8cceb1d1d06f372878935949d421585186b Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Fri, 27 Mar 2015 17:39:49 -0400 Subject: ring-buffer: Remove duplicate use of '&' in recursive code A clean up of the recursive protection code changed val = this_cpu_read(current_context); val--; val &= this_cpu_read(current_context); to val = this_cpu_read(current_context); val &= val & (val - 1); Which has a duplicate use of '&' as the above is the same as val = val & (val - 1); Actually, it would be best to remove that line altogether and just add it to where it is used. 
And Christoph even mentioned that it can be further compacted to just a single line: __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); Link: http://lkml.kernel.org/alpine.DEB.2.11.1503271423580.23114@gentwo.org Suggested-by: Christoph Lameter Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 922048a0f7ea..0315d43176d8 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2703,10 +2703,7 @@ static __always_inline int trace_recursive_lock(void) static __always_inline void trace_recursive_unlock(void) { - unsigned int val = __this_cpu_read(current_context); - - val &= val & (val - 1); - __this_cpu_write(current_context, val); + __this_cpu_and(current_context, __this_cpu_read(current_context) - 1); } #else -- cgit v1.3-8-gc7d7 From 9f083b74df3a7eaa100b456f2dc195512daf728e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:05:19 +0100 Subject: clockevents: Remove CONFIG_GENERIC_CLOCKEVENTS_BUILD This option was for simpler migration to the clock events code. Most architectures have been converted and the option has been disfunctional as a standalone option for quite some time. Remove it. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/5021859.jl9OC1medj@vostro.rjw.lan Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 9 +++------ kernel/time/Kconfig | 6 ------ kernel/time/Makefile | 6 ++---- kernel/time/clockevents.c | 3 --- kernel/time/tick-internal.h | 4 ++-- 5 files changed, 7 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index e20232c3320a..57975131dac1 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -22,7 +22,7 @@ enum clock_event_nofitiers { CLOCK_EVT_NOTIFY_CPU_DEAD, }; -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD +#ifdef CONFIG_GENERIC_CLOCKEVENTS #include #include @@ -229,13 +229,9 @@ static inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) {}; #endif -#ifdef CONFIG_GENERIC_CLOCKEVENTS extern int clockevents_notify(unsigned long reason, void *arg); -#else -static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } -#endif -#else /* CONFIG_GENERIC_CLOCKEVENTS_BUILD */ +#else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void clockevents_suspend(void) {} static inline void clockevents_resume(void) {} @@ -243,6 +239,7 @@ static inline void clockevents_resume(void) {} static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } static inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) {}; +static inline int clockevents_notify(unsigned long reason, void *arg) { return 0; } #endif diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index d626dc98e8df..579ce1b929af 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -33,12 +33,6 @@ config ARCH_USES_GETTIMEOFFSET config GENERIC_CLOCKEVENTS bool -# Migration helper. 
Builds, but does not invoke -config GENERIC_CLOCKEVENTS_BUILD - bool - default y - depends on GENERIC_CLOCKEVENTS - # Architecture can handle broadcast in a driver-agnostic way config ARCH_HAS_TICK_BROADCAST bool diff --git a/kernel/time/Makefile b/kernel/time/Makefile index c09c07817d7a..01f0312419b3 100644 --- a/kernel/time/Makefile +++ b/kernel/time/Makefile @@ -2,15 +2,13 @@ obj-y += time.o timer.o hrtimer.o itimer.o posix-timers.o posix-cpu-timers.o obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o obj-y += timeconv.o timecounter.o posix-clock.o alarmtimer.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o -obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS) += clockevents.o tick-common.o ifeq ($(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST),y) obj-y += tick-broadcast.o obj-$(CONFIG_TICK_ONESHOT) += tick-broadcast-hrtimer.o endif obj-$(CONFIG_GENERIC_SCHED_CLOCK) += sched_clock.o -obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o -obj-$(CONFIG_TICK_ONESHOT) += tick-sched.o +obj-$(CONFIG_TICK_ONESHOT) += tick-oneshot.o tick-sched.o obj-$(CONFIG_TIMER_STATS) += timer_stats.o obj-$(CONFIG_DEBUG_FS) += timekeeping_debug.o obj-$(CONFIG_TEST_UDELAY) += test_udelay.o diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 73689df1e4b8..3531beecbe95 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -645,7 +645,6 @@ void clockevents_resume(void) dev->resume(dev); } -#ifdef CONFIG_GENERIC_CLOCKEVENTS /** * clockevents_notify - notification about relevant events * Returns 0 on success, any other value on error @@ -831,5 +830,3 @@ static int __init clockevents_init_sysfs(void) } device_initcall(clockevents_init_sysfs); #endif /* SYSFS */ - -#endif /* GENERIC_CLOCK_EVENTS */ diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 98700e4a2000..c7b75bec27f2 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -10,7 +10,7 @@ extern seqlock_t jiffies_lock; #define CS_NAME_LEN 32 -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD +#ifdef CONFIG_GENERIC_CLOCKEVENTS #define TICK_DO_TIMER_NONE -1 #define TICK_DO_TIMER_BOOT -2 @@ -167,7 +167,7 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); -#endif +#endif /* GENERIC_CLOCKEVENTS */ extern void do_timer(unsigned long ticks); extern void update_wall_time(void); -- cgit v1.3-8-gc7d7 From bfb83b27519aa7ed9510f601a8f825a2c1484bc2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:06:04 +0100 Subject: tick: Move clocksource related stuff to timekeeping.h Move clocksource related stuff to timekeeping.h and remove the pointless include from ntp.c Signed-off-by: Thomas Gleixner [ rjw: Subject ] Signed-off-by: Rafael J. 
Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/2714218.nM5AEfAHj0@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 2 +- kernel/time/jiffies.c | 2 +- kernel/time/ntp.c | 1 - kernel/time/tick-internal.h | 6 ------ kernel/time/timekeeping.h | 7 +++++++ 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c3be3c71bbad..8b4010f0b1b4 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,7 +31,7 @@ #include #include -#include "tick-internal.h" +#include "timekeeping.h" #include "timekeeping_internal.h" /** diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index c4bb518725b5..347fecf86a3f 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -25,7 +25,7 @@ #include #include -#include "tick-internal.h" +#include "timekeeping.h" /* The Jiffies based clocksource is the lowest common * denominator clock source which should function on diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 0f60b08a4f07..9ad60d028508 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -17,7 +17,6 @@ #include #include -#include "tick-internal.h" #include "ntp_internal.h" /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index c7b75bec27f2..cba52140a298 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -6,10 +6,6 @@ #include "timekeeping.h" -extern seqlock_t jiffies_lock; - -#define CS_NAME_LEN 32 - #ifdef CONFIG_GENERIC_CLOCKEVENTS #define TICK_DO_TIMER_NONE -1 @@ -169,5 +165,3 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); #endif /* GENERIC_CLOCKEVENTS */ -extern void do_timer(unsigned long ticks); -extern void update_wall_time(void); diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 1d91416055d5..ead8794b9a4e 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -19,4 +19,11 @@ extern void timekeeping_clocktai(struct timespec *ts); extern int timekeeping_suspend(void); extern void timekeeping_resume(void); +extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); + +extern seqlock_t jiffies_lock; + +#define CS_NAME_LEN 32 + #endif -- cgit v1.3-8-gc7d7 From b7475eb599ddb2e8cab2dc86ff38a9507463ad6b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:06:47 +0100 Subject: tick: Simplify tick-internal.h tick-internal.h is pretty confusing as a lot of the stub inlines are there several times. Distangle the maze and make clear functional sections. Signed-off-by: Thomas Gleixner [ rjw: Subject ] Signed-off-by: Rafael J. 
Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/16068264.vcNp79HLaT@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 145 +++++++++++++++----------------------------- 1 file changed, 49 insertions(+), 96 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index cba52140a298..d86eb8d485e9 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -27,14 +27,19 @@ extern bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev); extern void tick_install_replacement(struct clock_event_device *dev); -extern void clockevents_shutdown(struct clock_event_device *dev); extern int clockevents_tick_resume(struct clock_event_device *dev); +/* Check, if the device is functional or a dummy for broadcast */ +static inline int tick_device_is_functional(struct clock_event_device *dev) +{ + return !(dev->features & CLOCK_EVT_FEAT_DUMMY); +} +extern void clockevents_shutdown(struct clock_event_device *dev); +extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); +#endif /* GENERIC_CLOCKEVENTS */ -/* - * NO_HZ / high resolution timer shared code - */ +/* Oneshot related functions */ #ifdef CONFIG_TICK_ONESHOT extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), @@ -43,69 +48,19 @@ extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); -# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST -extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern int tick_broadcast_oneshot_control(unsigned long reason); -extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); -extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); -extern int tick_broadcast_oneshot_active(void); -extern void tick_check_oneshot_broadcast_this_cpu(void); -bool tick_broadcast_oneshot_available(void); -# else /* BROADCAST */ -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) -{ - BUG(); -} -static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } -static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } -static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline void tick_check_oneshot_broadcast_this_cpu(void) { } -static inline bool tick_broadcast_oneshot_available(void) { return true; } -# endif /* !BROADCAST */ - +static inline bool tick_oneshot_possible(void) { return true; } #else /* !ONESHOT */ static inline void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), - ktime_t nextevt) -{ - BUG(); -} -static inline void tick_resume_oneshot(void) -{ - BUG(); -} -static inline int tick_program_event(ktime_t expires, int force) -{ - return 0; -} + ktime_t nextevt) { BUG(); } +static inline void tick_resume_oneshot(void) { BUG(); } +static inline int tick_program_event(ktime_t expires, int force) { return 0; } static inline void tick_oneshot_notify(void) { } -static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) -{ - BUG(); -} -static inline int 
tick_broadcast_oneshot_control(unsigned long reason) { return 0; } -static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } -static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) -{ - return 0; -} -static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline bool tick_broadcast_oneshot_available(void) { return false; } +static inline bool tick_oneshot_possible(void) { return false; } #endif /* !TICK_ONESHOT */ -/* NO_HZ_FULL internal */ -#ifdef CONFIG_NO_HZ_FULL -extern void tick_nohz_init(void); -# else -static inline void tick_nohz_init(void) { } -#endif - -/* - * Broadcasting support - */ +/* Broadcasting support */ #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); @@ -115,53 +70,51 @@ extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); extern int tick_resume_broadcast(void); extern void tick_broadcast_init(void); -extern void -tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); -int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); - +extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); +extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); #else /* !BROADCAST */ - -static inline void tick_install_broadcast_device(struct clock_event_device *dev) -{ -} - -static inline int tick_is_broadcast_device(struct clock_event_device *dev) -{ - return 0; -} -static inline int tick_device_uses_broadcast(struct clock_event_device *dev, - int cpu) -{ - return 0; -} +static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } +static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } +static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline int tick_resume_broadcast(void) { return 0; } static inline void tick_broadcast_init(void) { } -static inline int tick_broadcast_update_freq(struct clock_event_device *dev, - u32 freq) { return -ENODEV; } +static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } -/* - * Set the periodic handler in non broadcast mode - */ -static inline void tick_set_periodic_handler(struct clock_event_device *dev, - int broadcast) +/* Set the periodic handler in non broadcast mode */ +static inline void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) { dev->event_handler = tick_handle_periodic; } #endif /* !BROADCAST */ -/* - * Check, if the device is functional or a dummy for broadcast - */ -static inline int tick_device_is_functional(struct clock_event_device *dev) -{ - return !(dev->features & CLOCK_EVT_FEAT_DUMMY); -} - -int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); - -#endif /* GENERIC_CLOCKEVENTS */ +/* Functions related to oneshot broadcasting */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); +extern int 
tick_broadcast_oneshot_control(unsigned long reason); +extern void tick_broadcast_switch_to_oneshot(void); +extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); +extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); +extern int tick_broadcast_oneshot_active(void); +extern void tick_check_oneshot_broadcast_this_cpu(void); +bool tick_broadcast_oneshot_available(void); +#else /* BROADCAST && ONESHOT */ +static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } +static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } +static inline void tick_broadcast_switch_to_oneshot(void) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } +static inline int tick_broadcast_oneshot_active(void) { return 0; } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } +static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } +#endif /* !BROADCAST && ONESHOT */ +/* NO_HZ_FULL internal */ +#ifdef CONFIG_NO_HZ_FULL +extern void tick_nohz_init(void); +# else +static inline void tick_nohz_init(void) { } +#endif -- cgit v1.3-8-gc7d7 From c1797baf6880174f899ce3960d0598f5bbeeb7ff Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:07:37 +0100 Subject: tick: Move core only declarations and functions to core No point to expose everything to the world. People just believe such functions can be abused for whatever purposes. Sigh. Signed-off-by: Thomas Gleixner [ Rebased on top of 4.0-rc5 ] Signed-off-by: Rafael J. Wysocki Cc: Nicolas Pitre Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/28017337.VbCUc39Gme@vostro.rjw.lan [ Merged to latest timers/core ] Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 15 ++--- include/linux/tick.h | 134 ++++++-------------------------------------- kernel/time/clocksource.c | 2 +- kernel/time/hrtimer.c | 2 +- kernel/time/tick-internal.h | 15 +++++ kernel/time/tick-sched.c | 7 ++- kernel/time/tick-sched.h | 64 +++++++++++++++++++++ kernel/time/timer_list.c | 2 +- 8 files changed, 112 insertions(+), 129 deletions(-) create mode 100644 kernel/time/tick-sched.h (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 57975131dac1..bc3af821350b 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -193,15 +193,6 @@ extern void clockevents_config_and_register(struct clock_event_device *dev, extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); -extern void clockevents_exchange_device(struct clock_event_device *old, - struct clock_event_device *new); -extern void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state); -extern int clockevents_program_event(struct clock_event_device *dev, - ktime_t expires, bool force); - -extern void clockevents_handle_noop(struct clock_event_device *dev); - static inline void clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) { @@ -209,6 +200,12 @@ clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) freq, minsec); } +/* Should be core only, but is abused by arm bl_switcher */ +extern void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state); +extern int clockevents_program_event(struct clock_event_device *dev, + ktime_t expires, bool force); + extern void 
clockevents_suspend(void); extern void clockevents_resume(void); diff --git a/include/linux/tick.h b/include/linux/tick.h index 9c085dc12ae9..f9a2d2687a46 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -1,7 +1,5 @@ -/* linux/include/linux/tick.h - * - * This file contains the structure definitions for tick related functions - * +/* + * Tick related global functions */ #ifndef _LINUX_TICK_H #define _LINUX_TICK_H @@ -9,13 +7,12 @@ #include #include #include -#include #include #include #include +/* ARM BL switcher abuse support */ #ifdef CONFIG_GENERIC_CLOCKEVENTS - enum tick_device_mode { TICKDEV_MODE_PERIODIC, TICKDEV_MODE_ONESHOT, @@ -25,133 +22,38 @@ struct tick_device { struct clock_event_device *evtdev; enum tick_device_mode mode; }; - -enum tick_nohz_mode { - NOHZ_MODE_INACTIVE, - NOHZ_MODE_LOWRES, - NOHZ_MODE_HIGHRES, -}; - -/** - * struct tick_sched - sched tick emulation and no idle tick control/stats - * @sched_timer: hrtimer to schedule the periodic tick in high - * resolution mode - * @last_tick: Store the last tick expiry time when the tick - * timer is modified for nohz sleeps. This is necessary - * to resume the tick timer operation in the timeline - * when the CPU returns from nohz sleep. - * @tick_stopped: Indicator that the idle tick has been stopped - * @idle_jiffies: jiffies at the entry to idle for idle time accounting - * @idle_calls: Total number of idle calls - * @idle_sleeps: Number of idle calls, where the sched tick was stopped - * @idle_entrytime: Time when the idle call was entered - * @idle_waketime: Time when the idle was interrupted - * @idle_exittime: Time when the idle state was left - * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped - * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding - * @sleep_length: Duration of the current idle sleep - * @do_timer_lst: CPU was the last one doing do_timer before going idle - */ -struct tick_sched { - struct hrtimer sched_timer; - unsigned long check_clocks; - enum tick_nohz_mode nohz_mode; - ktime_t last_tick; - int inidle; - int tick_stopped; - unsigned long idle_jiffies; - unsigned long idle_calls; - unsigned long idle_sleeps; - int idle_active; - ktime_t idle_entrytime; - ktime_t idle_waketime; - ktime_t idle_exittime; - ktime_t idle_sleeptime; - ktime_t iowait_sleeptime; - ktime_t sleep_length; - unsigned long last_jiffies; - unsigned long next_jiffies; - ktime_t idle_expires; - int do_timer_last; -}; - -extern void __init tick_init(void); -extern int tick_is_oneshot_available(void); extern struct tick_device *tick_get_device(int cpu); +#endif +#ifdef CONFIG_GENERIC_CLOCKEVENTS +extern void __init tick_init(void); extern void tick_freeze(void); extern void tick_unfreeze(void); +#else /* CONFIG_GENERIC_CLOCKEVENTS */ +static inline void tick_init(void) { } +static inline void tick_freeze(void) { } +static inline void tick_unfreeze(void) { } +#endif /* !CONFIG_GENERIC_CLOCKEVENTS */ -# ifdef CONFIG_HIGH_RES_TIMERS -extern int tick_init_highres(void); -extern int tick_program_event(ktime_t expires, int force); -extern void tick_setup_sched_timer(void); -# endif - -# if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS -extern void tick_cancel_sched_timer(int cpu); -# else -static inline void tick_cancel_sched_timer(int cpu) { } -# endif - -# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST -extern struct tick_device *tick_get_broadcast_device(void); -extern struct cpumask *tick_get_broadcast_mask(void); - -# ifdef 
CONFIG_TICK_ONESHOT -extern struct cpumask *tick_get_broadcast_oneshot_mask(void); -# endif - -# endif /* BROADCAST */ - -# ifdef CONFIG_TICK_ONESHOT -extern void tick_clock_notify(void); -extern int tick_check_oneshot_change(int allow_nohz); -extern struct tick_sched *tick_get_tick_sched(int cpu); +#ifdef CONFIG_TICK_ONESHOT extern void tick_irq_enter(void); -extern int tick_oneshot_mode_active(void); # ifndef arch_needs_cpu # define arch_needs_cpu() (0) # endif # else -static inline void tick_clock_notify(void) { } -static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } -static inline void tick_irq_enter(void) { } -static inline int tick_oneshot_mode_active(void) { return 0; } -# endif - -#else /* CONFIG_GENERIC_CLOCKEVENTS */ -static inline void tick_init(void) { } -static inline void tick_freeze(void) { } -static inline void tick_unfreeze(void) { } -static inline void tick_cancel_sched_timer(int cpu) { } -static inline void tick_clock_notify(void) { } -static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } static inline void tick_irq_enter(void) { } -static inline int tick_oneshot_mode_active(void) { return 0; } -#endif /* !CONFIG_GENERIC_CLOCKEVENTS */ - -# ifdef CONFIG_NO_HZ_COMMON -DECLARE_PER_CPU(struct tick_sched, tick_cpu_sched); - -static inline int tick_nohz_tick_stopped(void) -{ - return __this_cpu_read(tick_cpu_sched.tick_stopped); -} +#endif +#ifdef CONFIG_NO_HZ_COMMON +extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); - -# else /* !CONFIG_NO_HZ_COMMON */ -static inline int tick_nohz_tick_stopped(void) -{ - return 0; -} - +#else /* !CONFIG_NO_HZ_COMMON */ +static inline int tick_nohz_tick_stopped(void) { return 0; } static inline void tick_nohz_idle_enter(void) { } static inline void tick_nohz_idle_exit(void) { } @@ -163,7 +65,7 @@ static inline ktime_t tick_nohz_get_sleep_length(void) } static inline u64 get_cpu_idle_time_us(int cpu, u64 *unused) { return -1; } static inline u64 get_cpu_iowait_time_us(int cpu, u64 *unused) { return -1; } -# endif /* !CONFIG_NO_HZ_COMMON */ +#endif /* !CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL extern bool tick_nohz_full_running; diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 8b4010f0b1b4..c3be3c71bbad 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -31,7 +31,7 @@ #include #include -#include "timekeeping.h" +#include "tick-internal.h" #include "timekeeping_internal.h" /** diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index bee0c1f78091..721d29b99d10 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -54,7 +54,7 @@ #include -#include "timekeeping.h" +#include "tick-internal.h" /* * The timer bases: diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index d86eb8d485e9..dd2c45d057b9 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -5,6 +5,7 @@ #include #include "timekeeping.h" +#include "tick-sched.h" #ifdef CONFIG_GENERIC_CLOCKEVENTS @@ -26,6 +27,7 @@ extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev); extern void tick_install_replacement(struct clock_event_device *dev); +extern int 
tick_is_oneshot_available(void); extern int clockevents_tick_resume(struct clock_event_device *dev); /* Check, if the device is functional or a dummy for broadcast */ @@ -35,6 +37,9 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) } extern void clockevents_shutdown(struct clock_event_device *dev); +extern void clockevents_exchange_device(struct clock_event_device *old, + struct clock_event_device *new); +extern void clockevents_handle_noop(struct clock_event_device *dev); extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); #endif /* GENERIC_CLOCKEVENTS */ @@ -49,6 +54,10 @@ extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); extern void tick_resume_oneshot(void); static inline bool tick_oneshot_possible(void) { return true; } +extern int tick_oneshot_mode_active(void); +extern void tick_clock_notify(void); +extern int tick_check_oneshot_change(int allow_nohz); +extern int tick_init_highres(void); #else /* !ONESHOT */ static inline void tick_setup_oneshot(struct clock_event_device *newdev, @@ -58,6 +67,9 @@ static inline void tick_resume_oneshot(void) { BUG(); } static inline int tick_program_event(ktime_t expires, int force) { return 0; } static inline void tick_oneshot_notify(void) { } static inline bool tick_oneshot_possible(void) { return false; } +static inline int tick_oneshot_mode_active(void) { return 0; } +static inline void tick_clock_notify(void) { } +static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } #endif /* !TICK_ONESHOT */ /* Broadcasting support */ @@ -72,6 +84,8 @@ extern int tick_resume_broadcast(void); extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); +extern struct tick_device *tick_get_broadcast_device(void); +extern struct cpumask *tick_get_broadcast_mask(void); #else /* !BROADCAST */ static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } @@ -101,6 +115,7 @@ extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); +extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* BROADCAST && ONESHOT */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index a4c4edac4528..914259128145 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -34,7 +34,7 @@ /* * Per cpu nohz control structure */ -DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); +static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); /* * The time, when the last jiffy update happened. Protected by jiffies_lock. 
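
The tick-sched.c hunks around this point show the encapsulation pattern the commit applies throughout: the per-cpu tick_sched instance loses its global linkage and outside code is routed through a small out-of-line accessor, so the struct definition can move into the new core-only tick-sched.h. A minimal sketch of that pattern, using hypothetical names (foo_state, foo_is_stopped) rather than the real tick symbols, assuming an ordinary kernel build environment:

	/* foo.h -- public side: only the accessor, no struct definition */
	int foo_is_stopped(void);

	/* foo.c -- private side */
	#include <linux/percpu.h>

	struct foo_state {
		int stopped;
	};

	/* previously a globally visible DEFINE_PER_CPU(); now file-local */
	static DEFINE_PER_CPU(struct foo_state, foo_cpu_state);

	int foo_is_stopped(void)
	{
		/* read this CPU's instance without exposing the variable */
		return __this_cpu_read(foo_cpu_state.stopped);
	}

Callers that used to poke the per-cpu variable directly now only need the header declaration, which is exactly what the tick_nohz_tick_stopped() change in the next hunk does for tick_cpu_sched.
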
@@ -416,6 +416,11 @@ static int __init setup_tick_nohz(char *str) __setup("nohz=", setup_tick_nohz); +int tick_nohz_tick_stopped(void) +{ + return __this_cpu_read(tick_cpu_sched.tick_stopped); +} + /** * tick_nohz_update_jiffies - update jiffies when idle was interrupted * diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h new file mode 100644 index 000000000000..930743249127 --- /dev/null +++ b/kernel/time/tick-sched.h @@ -0,0 +1,64 @@ +#ifndef _TICK_SCHED_H +#define _TICK_SCHED_H + +#include + +enum tick_nohz_mode { + NOHZ_MODE_INACTIVE, + NOHZ_MODE_LOWRES, + NOHZ_MODE_HIGHRES, +}; + +/** + * struct tick_sched - sched tick emulation and no idle tick control/stats + * @sched_timer: hrtimer to schedule the periodic tick in high + * resolution mode + * @last_tick: Store the last tick expiry time when the tick + * timer is modified for nohz sleeps. This is necessary + * to resume the tick timer operation in the timeline + * when the CPU returns from nohz sleep. + * @tick_stopped: Indicator that the idle tick has been stopped + * @idle_jiffies: jiffies at the entry to idle for idle time accounting + * @idle_calls: Total number of idle calls + * @idle_sleeps: Number of idle calls, where the sched tick was stopped + * @idle_entrytime: Time when the idle call was entered + * @idle_waketime: Time when the idle was interrupted + * @idle_exittime: Time when the idle state was left + * @idle_sleeptime: Sum of the time slept in idle with sched tick stopped + * @iowait_sleeptime: Sum of the time slept in idle with sched tick stopped, with IO outstanding + * @sleep_length: Duration of the current idle sleep + * @do_timer_lst: CPU was the last one doing do_timer before going idle + */ +struct tick_sched { + struct hrtimer sched_timer; + unsigned long check_clocks; + enum tick_nohz_mode nohz_mode; + ktime_t last_tick; + int inidle; + int tick_stopped; + unsigned long idle_jiffies; + unsigned long idle_calls; + unsigned long idle_sleeps; + int idle_active; + ktime_t idle_entrytime; + ktime_t idle_waketime; + ktime_t idle_exittime; + ktime_t idle_sleeptime; + ktime_t iowait_sleeptime; + ktime_t sleep_length; + unsigned long last_jiffies; + unsigned long next_jiffies; + ktime_t idle_expires; + int do_timer_last; +}; + +extern struct tick_sched *tick_get_tick_sched(int cpu); + +extern void tick_setup_sched_timer(void); +#if defined CONFIG_NO_HZ_COMMON || defined CONFIG_HIGH_RES_TIMERS +extern void tick_cancel_sched_timer(int cpu); +#else +static inline void tick_cancel_sched_timer(int cpu) { } +#endif + +#endif diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 05aa5590106a..e878c2e0ba45 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -16,10 +16,10 @@ #include #include #include -#include #include +#include "tick-internal.h" struct timer_list_iter { int cpu; -- cgit v1.3-8-gc7d7 From db6f672ef11d7a3c5aa128a3c3e57c92580a25f7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:08:27 +0100 Subject: clockevents: Remove extra local_irq_save() in clockevents_exchange_device() Called with 'clockevents_lock' held and interrupts disabled already. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/51005827.yXt5tjZMBs@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/time/clockevents.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3531beecbe95..b73002718536 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -595,14 +595,12 @@ void clockevents_handle_noop(struct clock_event_device *dev) * @old: device to release (can be NULL) * @new: device to request (can be NULL) * - * Called from the notifier chain. clockevents_lock is held already + * Called from various tick functions with clockevents_lock held and + * interrupts disabled. */ void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new) { - unsigned long flags; - - local_irq_save(flags); /* * Caller releases a clock event device. We queue it into the * released list and do a notify add later. @@ -618,7 +616,6 @@ void clockevents_exchange_device(struct clock_event_device *old, BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED); clockevents_shutdown(new); } - local_irq_restore(flags); } /** -- cgit v1.3-8-gc7d7 From 4ffee521f36390c7720d493591b764ca35c8030b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:09:16 +0100 Subject: clockevents: Make suspend/resume calls explicit clockevents_notify() is a leftover from the early design of the clockevents facility. It's really not a notification mechanism, it's a multiplex call. We are way better off to have explicit calls instead of this monstrosity. Split out the suspend/resume() calls and invoke them directly from the call sites. No locking required at this point because these calls happen with interrupts disabled and a single cpu online. Signed-off-by: Thomas Gleixner [ Rebased on top of 4.0-rc5. ] Signed-off-by: Rafael J. Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/713674030.jVm1qaHuPf@vostro.rjw.lan [ Rebased on top of latest timers/core. 
] Signed-off-by: Ingo Molnar --- arch/x86/xen/suspend.c | 11 ++++------- include/linux/clockchips.h | 2 -- include/linux/tick.h | 3 +++ kernel/time/clockevents.c | 9 --------- kernel/time/tick-common.c | 26 +++++++++++++++++++++++--- kernel/time/tick-internal.h | 3 ++- kernel/time/timekeeping.c | 6 ++---- 7 files changed, 34 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index c4df9dbd63b7..033e428581f4 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -81,17 +81,14 @@ void xen_arch_post_suspend(int cancelled) static void xen_vcpu_notify_restore(void *data) { - unsigned long reason = (unsigned long)data; - /* Boot processor notified via generic timekeeping_resume() */ - if ( smp_processor_id() == 0) + if (smp_processor_id() == 0) return; - clockevents_notify(reason, NULL); + tick_resume(); } void xen_arch_resume(void) { - on_each_cpu(xen_vcpu_notify_restore, - (void *)CLOCK_EVT_NOTIFY_RESUME, 1); + on_each_cpu(xen_vcpu_notify_restore, NULL, 1); } diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index bc3af821350b..50ce9750754f 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -16,8 +16,6 @@ enum clock_event_nofitiers { CLOCK_EVT_NOTIFY_BROADCAST_FORCE, CLOCK_EVT_NOTIFY_BROADCAST_ENTER, CLOCK_EVT_NOTIFY_BROADCAST_EXIT, - CLOCK_EVT_NOTIFY_SUSPEND, - CLOCK_EVT_NOTIFY_RESUME, CLOCK_EVT_NOTIFY_CPU_DYING, CLOCK_EVT_NOTIFY_CPU_DEAD, }; diff --git a/include/linux/tick.h b/include/linux/tick.h index f9a2d2687a46..7e07e0e3d898 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -29,10 +29,13 @@ extern struct tick_device *tick_get_device(int cpu); extern void __init tick_init(void); extern void tick_freeze(void); extern void tick_unfreeze(void); +/* Should be core only, but XEN resume magic abuses this interface */ +extern void tick_resume(void); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_freeze(void) { } static inline void tick_unfreeze(void) { } +static inline void tick_resume(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #ifdef CONFIG_TICK_ONESHOT diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index b73002718536..7af614829da1 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -670,15 +670,6 @@ int clockevents_notify(unsigned long reason, void *arg) tick_handover_do_timer(arg); break; - case CLOCK_EVT_NOTIFY_SUSPEND: - tick_suspend(); - tick_suspend_broadcast(); - break; - - case CLOCK_EVT_NOTIFY_RESUME: - tick_resume(); - break; - case CLOCK_EVT_NOTIFY_CPU_DEAD: tick_shutdown_broadcast_oneshot(arg); tick_shutdown_broadcast(arg); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index a5b877130ae9..1a60c2ae96a8 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -373,18 +373,39 @@ void tick_shutdown(unsigned int *cpup) } } +/** + * tick_suspend - Suspend the tick and the broadcast device + * + * Called from syscore_suspend() via timekeeping_suspend with only one + * CPU online and interrupts disabled or from tick_unfreeze() under + * tick_freeze_lock. + * + * No locks required. Nothing can change the per cpu device. 
+ */ void tick_suspend(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); clockevents_shutdown(td->evtdev); + tick_suspend_broadcast(); } +/** + * tick_resume - Resume the tick and the broadcast device + * + * Called from syscore_resume() via timekeeping_resume with only one + * CPU online and interrupts disabled or from tick_unfreeze() under + * tick_freeze_lock. + * + * No locks required. Nothing can change the per cpu device. + */ void tick_resume(void) { - struct tick_device *td = this_cpu_ptr(&tick_cpu_device); - int broadcast = tick_resume_broadcast(); + struct tick_device *td; + int broadcast; + broadcast = tick_resume_broadcast(); + td = this_cpu_ptr(&tick_cpu_device); clockevents_tick_resume(td->evtdev); if (!broadcast) { @@ -416,7 +437,6 @@ void tick_freeze(void) timekeeping_suspend(); } else { tick_suspend(); - tick_suspend_broadcast(); } raw_spin_unlock(&tick_freeze_lock); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index dd2c45d057b9..85a957195bf6 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -23,7 +23,6 @@ extern void tick_check_new_device(struct clock_event_device *dev); extern void tick_handover_do_timer(int *cpup); extern void tick_shutdown(unsigned int *cpup); extern void tick_suspend(void); -extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev); extern void tick_install_replacement(struct clock_event_device *dev); @@ -42,6 +41,8 @@ extern void clockevents_exchange_device(struct clock_event_device *old, extern void clockevents_handle_noop(struct clock_event_device *dev); extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); +#else +static inline void tick_suspend(void) { } #endif /* GENERIC_CLOCKEVENTS */ /* Oneshot related functions */ diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index c3fcff06d30a..5b12292b343a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1389,9 +1389,7 @@ void timekeeping_resume(void) touch_softlockup_watchdog(); - clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL); - - /* Resume hrtimers */ + tick_resume(); hrtimers_resume(); } @@ -1444,7 +1442,7 @@ int timekeeping_suspend(void) write_seqcount_end(&tk_core.seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL); + tick_suspend(); clocksource_suspend(); clockevents_suspend(); -- cgit v1.3-8-gc7d7 From 080873ce2d1abd8c0a2b8c87bfa0762546a6b713 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:09:55 +0100 Subject: tick: Make tick_resume_broadcast_oneshot() static Solely used in tick-broadcast.c and the return value is hardcoded 0. Make it static and void. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1689058.QkHYDJSRKu@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 7 ++++--- kernel/time/tick-internal.h | 2 -- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f0f8ee9dbc28..60e6c23ce1c7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -37,8 +37,10 @@ static int tick_broadcast_force; #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc); #else static inline void tick_broadcast_clear_oneshot(int cpu) { } +static inline void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { } #endif /* @@ -475,7 +477,7 @@ int tick_resume_broadcast(void) break; case TICKDEV_MODE_ONESHOT: if (!cpumask_empty(tick_broadcast_mask)) - broadcast = tick_resume_broadcast_oneshot(bc); + tick_resume_broadcast_oneshot(bc); break; } } @@ -541,10 +543,9 @@ static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu, return ret; } -int tick_resume_broadcast_oneshot(struct clock_event_device *bc) +static void tick_resume_broadcast_oneshot(struct clock_event_device *bc) { clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT); - return 0; } /* diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 85a957195bf6..5c9f0eec56b2 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -112,7 +112,6 @@ extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); -extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); @@ -122,7 +121,6 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } -static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) { return 0; } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } -- cgit v1.3-8-gc7d7 From f46481d0a7cb942b84145acb80ad43bdb1ff8eb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:11:04 +0100 Subject: tick/xen: Provide and use tick_suspend_local() and tick_resume_local() Xen calls on every cpu into tick_resume() which is just wrong. tick_resume() is for the syscore global suspend/resume invocation. What XEN really wants is a per cpu local resume function. Provide a tick_resume_local() function and use it in XEN. Also provide a complementary tick_suspend_local() and modify tick_unfreeze() and tick_freeze(), respectively, to use the new local tick resume/suspend functions. Signed-off-by: Thomas Gleixner [ Combined two patches, rebased, modified subject/changelog. ] Signed-off-by: Rafael J. 
Wysocki Cc: Boris Ostrovsky Cc: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1698741.eezk9tnXtG@vostro.rjw.lan [ Merged to latest timers/core. ] Signed-off-by: Ingo Molnar --- arch/x86/xen/suspend.c | 2 +- include/linux/tick.h | 6 ++--- kernel/time/tick-broadcast.c | 24 +++++++++++++------ kernel/time/tick-common.c | 55 ++++++++++++++++++++++++++++++-------------- kernel/time/tick-internal.h | 8 +++++-- 5 files changed, 65 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 033e428581f4..d9497698645a 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -85,7 +85,7 @@ static void xen_vcpu_notify_restore(void *data) if (smp_processor_id() == 0) return; - tick_resume(); + tick_resume_local(); } void xen_arch_resume(void) diff --git a/include/linux/tick.h b/include/linux/tick.h index 7e07e0e3d898..a3d4d2840e7f 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -29,13 +29,13 @@ extern struct tick_device *tick_get_device(int cpu); extern void __init tick_init(void); extern void tick_freeze(void); extern void tick_unfreeze(void); -/* Should be core only, but XEN resume magic abuses this interface */ -extern void tick_resume(void); +/* Should be core only, but XEN resume magic requires this */ +extern void tick_resume_local(void); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_freeze(void) { } static inline void tick_unfreeze(void) { } -static inline void tick_resume(void) { } +static inline void tick_resume_local(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #ifdef CONFIG_TICK_ONESHOT diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 60e6c23ce1c7..19cfb381faa9 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -455,11 +455,26 @@ void tick_suspend_broadcast(void) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -int tick_resume_broadcast(void) +/* + * This is called from tick_resume_local() on a resuming CPU. That's + * called from the core resume function, tick_unfreeze() and the magic XEN + * resume hackery. + * + * In none of these cases the broadcast device mode can change and the + * bit of the resuming CPU in the broadcast mask is safe as well. 
+ */ +bool tick_resume_check_broadcast(void) +{ + if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT) + return false; + else + return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask); +} + +void tick_resume_broadcast(void) { struct clock_event_device *bc; unsigned long flags; - int broadcast = 0; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -472,8 +487,6 @@ int tick_resume_broadcast(void) case TICKDEV_MODE_PERIODIC: if (!cpumask_empty(tick_broadcast_mask)) tick_broadcast_start_periodic(bc); - broadcast = cpumask_test_cpu(smp_processor_id(), - tick_broadcast_mask); break; case TICKDEV_MODE_ONESHOT: if (!cpumask_empty(tick_broadcast_mask)) @@ -482,11 +495,8 @@ int tick_resume_broadcast(void) } } raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); - - return broadcast; } - #ifdef CONFIG_TICK_ONESHOT static cpumask_var_t tick_broadcast_oneshot_mask; diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 1a60c2ae96a8..da796d65d1fb 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -374,40 +374,32 @@ void tick_shutdown(unsigned int *cpup) } /** - * tick_suspend - Suspend the tick and the broadcast device + * tick_suspend_local - Suspend the local tick device * - * Called from syscore_suspend() via timekeeping_suspend with only one - * CPU online and interrupts disabled or from tick_unfreeze() under - * tick_freeze_lock. + * Called from the local cpu for freeze with interrupts disabled. * * No locks required. Nothing can change the per cpu device. */ -void tick_suspend(void) +static void tick_suspend_local(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); clockevents_shutdown(td->evtdev); - tick_suspend_broadcast(); } /** - * tick_resume - Resume the tick and the broadcast device + * tick_resume_local - Resume the local tick device * - * Called from syscore_resume() via timekeeping_resume with only one - * CPU online and interrupts disabled or from tick_unfreeze() under - * tick_freeze_lock. + * Called from the local CPU for unfreeze or XEN resume magic. * * No locks required. Nothing can change the per cpu device. */ -void tick_resume(void) +void tick_resume_local(void) { - struct tick_device *td; - int broadcast; + struct tick_device *td = this_cpu_ptr(&tick_cpu_device); + bool broadcast = tick_resume_check_broadcast(); - broadcast = tick_resume_broadcast(); - td = this_cpu_ptr(&tick_cpu_device); clockevents_tick_resume(td->evtdev); - if (!broadcast) { if (td->mode == TICKDEV_MODE_PERIODIC) tick_setup_periodic(td->evtdev, 0); @@ -416,6 +408,35 @@ void tick_resume(void) } } +/** + * tick_suspend - Suspend the tick and the broadcast device + * + * Called from syscore_suspend() via timekeeping_suspend with only one + * CPU online and interrupts disabled or from tick_unfreeze() under + * tick_freeze_lock. + * + * No locks required. Nothing can change the per cpu device. + */ +void tick_suspend(void) +{ + tick_suspend_local(); + tick_suspend_broadcast(); +} + +/** + * tick_resume - Resume the tick and the broadcast device + * + * Called from syscore_resume() via timekeeping_resume with only one + * CPU online and interrupts disabled. + * + * No locks required. Nothing can change the per cpu device. 
+ */ +void tick_resume(void) +{ + tick_resume_broadcast(); + tick_resume_local(); +} + static DEFINE_RAW_SPINLOCK(tick_freeze_lock); static unsigned int tick_freeze_depth; @@ -436,7 +457,7 @@ void tick_freeze(void) if (tick_freeze_depth == num_online_cpus()) { timekeeping_suspend(); } else { - tick_suspend(); + tick_suspend_local(); } raw_spin_unlock(&tick_freeze_lock); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 5c9f0eec56b2..6ba7bce732f2 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -23,6 +23,7 @@ extern void tick_check_new_device(struct clock_event_device *dev); extern void tick_handover_do_timer(int *cpup); extern void tick_shutdown(unsigned int *cpup); extern void tick_suspend(void); +extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev); extern void tick_install_replacement(struct clock_event_device *dev); @@ -43,6 +44,7 @@ extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); #else static inline void tick_suspend(void) { } +static inline void tick_resume(void) { } #endif /* GENERIC_CLOCKEVENTS */ /* Oneshot related functions */ @@ -81,7 +83,8 @@ extern int tick_is_broadcast_device(struct clock_event_device *dev); extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); -extern int tick_resume_broadcast(void); +extern void tick_resume_broadcast(void); +extern bool tick_resume_check_broadcast(void); extern void tick_broadcast_init(void); extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast); extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); @@ -95,7 +98,8 @@ static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } -static inline int tick_resume_broadcast(void) { return 0; } +static inline void tick_resume_broadcast(void) { } +static inline bool tick_resume_check_broadcast(void) { return false; } static inline void tick_broadcast_init(void) { } static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; } -- cgit v1.3-8-gc7d7 From 7270d11c56f594af4d166b2988421cd8ed933dc1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 25 Mar 2015 13:11:52 +0100 Subject: arm/bL_switcher: Kill tick suspend hackery Use the new tick_suspend/resume_local() and get rid of the homebrewn implementation of these in the ARM bL switcher. The check for the cpumask is completely pointless. There is no harm to suspend a per cpu tick device unconditionally. If that's a real issue then we fix it proper at the core level and not with some completely undocumented hacks in some random core code. Move the tick internals to the core code, now that this nuisance is gone. Signed-off-by: Thomas Gleixner [ rjw: Rebase, changelog ] Signed-off-by: Rafael J. 
Wysocki Cc: Nicolas Pitre Cc: Peter Zijlstra Cc: Russell King Link: http://lkml.kernel.org/r/1655112.Ws17YsMfN7@vostro.rjw.lan Signed-off-by: Ingo Molnar --- arch/arm/common/bL_switcher.c | 16 ++-------------- include/linux/clockchips.h | 6 ------ include/linux/tick.h | 19 ++++--------------- kernel/time/tick-common.c | 2 +- kernel/time/tick-internal.h | 5 +++++ kernel/time/tick-sched.h | 10 ++++++++++ 6 files changed, 22 insertions(+), 36 deletions(-) (limited to 'kernel') diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index d4f970a4d255..37dc0fe1093f 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -151,8 +151,6 @@ static int bL_switch_to(unsigned int new_cluster_id) unsigned int mpidr, this_cpu, that_cpu; unsigned int ob_mpidr, ob_cpu, ob_cluster, ib_mpidr, ib_cpu, ib_cluster; struct completion inbound_alive; - struct tick_device *tdev; - enum clock_event_state tdev_state; long volatile *handshake_ptr; int ipi_nr, ret; @@ -219,13 +217,7 @@ static int bL_switch_to(unsigned int new_cluster_id) /* redirect GIC's SGIs to our counterpart */ gic_migrate_target(bL_gic_id[ib_cpu][ib_cluster]); - tdev = tick_get_device(this_cpu); - if (tdev && !cpumask_equal(tdev->evtdev->cpumask, cpumask_of(this_cpu))) - tdev = NULL; - if (tdev) { - tdev_state = tdev->evtdev->state; - clockevents_set_state(tdev->evtdev, CLOCK_EVT_STATE_SHUTDOWN); - } + tick_suspend_local(); ret = cpu_pm_enter(); @@ -251,11 +243,7 @@ static int bL_switch_to(unsigned int new_cluster_id) ret = cpu_pm_exit(); - if (tdev) { - clockevents_set_state(tdev->evtdev, tdev_state); - clockevents_program_event(tdev->evtdev, - tdev->evtdev->next_event, 1); - } + tick_resume_local(); trace_cpu_migrate_finish(ktime_get_real_ns(), ib_mpidr); local_fiq_enable(); diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 50ce9750754f..3ac7e2d90374 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -198,12 +198,6 @@ clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 minsec) freq, minsec); } -/* Should be core only, but is abused by arm bl_switcher */ -extern void clockevents_set_state(struct clock_event_device *dev, - enum clock_event_state state); -extern int clockevents_program_event(struct clock_event_device *dev, - ktime_t expires, bool force); - extern void clockevents_suspend(void); extern void clockevents_resume(void); diff --git a/include/linux/tick.h b/include/linux/tick.h index a3d4d2840e7f..589868b09aff 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -11,30 +11,19 @@ #include #include -/* ARM BL switcher abuse support */ -#ifdef CONFIG_GENERIC_CLOCKEVENTS -enum tick_device_mode { - TICKDEV_MODE_PERIODIC, - TICKDEV_MODE_ONESHOT, -}; - -struct tick_device { - struct clock_event_device *evtdev; - enum tick_device_mode mode; -}; -extern struct tick_device *tick_get_device(int cpu); -#endif - #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void __init tick_init(void); extern void tick_freeze(void); extern void tick_unfreeze(void); -/* Should be core only, but XEN resume magic requires this */ +/* Should be core only, but ARM BL switcher requires it */ +extern void tick_suspend_local(void); +/* Should be core only, but XEN resume magic and ARM BL switcher require it */ extern void tick_resume_local(void); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_freeze(void) { } static inline void tick_unfreeze(void) { } +static inline void tick_suspend_local(void) { } 
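
In the bL_switcher hunk above, the open-coded clockevents_set_state()/clockevents_program_event() sequence is replaced by the two calls exported here. For a similar CPU power-transition path, the interface reduces to a suspend/resume bracket around the low-power entry. A hedged sketch of that calling pattern; my_cluster_switch() and its body are hypothetical, only tick_suspend_local(), tick_resume_local() and the cpu_pm notifier calls are real interfaces:

	#include <linux/tick.h>
	#include <linux/cpu_pm.h>

	static int my_cluster_switch(void)
	{
		int ret;

		tick_suspend_local();		/* shut down this CPU's tick device */

		ret = cpu_pm_enter();
		if (!ret) {
			/* ... hand over to the inbound CPU ... */
			ret = cpu_pm_exit();
		}

		tick_resume_local();		/* reprogram the local tick device */
		return ret;
	}

Suspending the local tick device unconditionally is harmless, which is why the cpumask check that the old bL_switcher code performed could simply be dropped.
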
static inline void tick_resume_local(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index da796d65d1fb..e28ba5c044c5 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -380,7 +380,7 @@ void tick_shutdown(unsigned int *cpup) * * No locks required. Nothing can change the per cpu device. */ -static void tick_suspend_local(void) +void tick_suspend_local(void) { struct tick_device *td = this_cpu_ptr(&tick_cpu_device); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 6ba7bce732f2..5fc2dafabd58 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -28,6 +28,7 @@ extern bool tick_check_replacement(struct clock_event_device *curdev, struct clock_event_device *newdev); extern void tick_install_replacement(struct clock_event_device *dev); extern int tick_is_oneshot_available(void); +extern struct tick_device *tick_get_device(int cpu); extern int clockevents_tick_resume(struct clock_event_device *dev); /* Check, if the device is functional or a dummy for broadcast */ @@ -39,6 +40,10 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) extern void clockevents_shutdown(struct clock_event_device *dev); extern void clockevents_exchange_device(struct clock_event_device *old, struct clock_event_device *new); +extern void clockevents_set_state(struct clock_event_device *dev, + enum clock_event_state state); +extern int clockevents_program_event(struct clock_event_device *dev, + ktime_t expires, bool force); extern void clockevents_handle_noop(struct clock_event_device *dev); extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index 930743249127..28b5da3e1a17 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -3,6 +3,16 @@ #include +enum tick_device_mode { + TICKDEV_MODE_PERIODIC, + TICKDEV_MODE_ONESHOT, +}; + +struct tick_device { + struct clock_event_device *evtdev; + enum tick_device_mode mode; +}; + enum tick_nohz_mode { NOHZ_MODE_INACTIVE, NOHZ_MODE_LOWRES, -- cgit v1.3-8-gc7d7 From 3ae7a939165c6159afb3c09e1d7405b6d1807f2b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Apr 2015 11:26:06 +0200 Subject: tick: Further simplify tick-internal.h Move the broadcasting related section to the GENERIC_CLOCKEVENTS=y section - this also solves build failures on architectures that don't use generic clockevents yet. Also standardize include file style to make it easier to read, and use nesting depth aware preprocessor directives to make future merges easier. Cc: Peter Zijlstra Cc: Rafael J. 
Wysocki Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-internal.h | 79 +++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 5fc2dafabd58..b6ba0a44e740 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -9,8 +9,8 @@ #ifdef CONFIG_GENERIC_CLOCKEVENTS -#define TICK_DO_TIMER_NONE -1 -#define TICK_DO_TIMER_BOOT -2 +# define TICK_DO_TIMER_NONE -1 +# define TICK_DO_TIMER_BOOT -2 DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; @@ -47,41 +47,9 @@ extern int clockevents_program_event(struct clock_event_device *dev, extern void clockevents_handle_noop(struct clock_event_device *dev); extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq); extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); -#else -static inline void tick_suspend(void) { } -static inline void tick_resume(void) { } -#endif /* GENERIC_CLOCKEVENTS */ - -/* Oneshot related functions */ -#ifdef CONFIG_TICK_ONESHOT -extern void tick_setup_oneshot(struct clock_event_device *newdev, - void (*handler)(struct clock_event_device *), - ktime_t nextevt); -extern int tick_program_event(ktime_t expires, int force); -extern void tick_oneshot_notify(void); -extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); -extern void tick_resume_oneshot(void); -static inline bool tick_oneshot_possible(void) { return true; } -extern int tick_oneshot_mode_active(void); -extern void tick_clock_notify(void); -extern int tick_check_oneshot_change(int allow_nohz); -extern int tick_init_highres(void); -#else /* !ONESHOT */ -static inline -void tick_setup_oneshot(struct clock_event_device *newdev, - void (*handler)(struct clock_event_device *), - ktime_t nextevt) { BUG(); } -static inline void tick_resume_oneshot(void) { BUG(); } -static inline int tick_program_event(ktime_t expires, int force) { return 0; } -static inline void tick_oneshot_notify(void) { } -static inline bool tick_oneshot_possible(void) { return false; } -static inline int tick_oneshot_mode_active(void) { return 0; } -static inline void tick_clock_notify(void) { } -static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } -#endif /* !TICK_ONESHOT */ /* Broadcasting support */ -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +# ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); @@ -95,7 +63,7 @@ extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadc extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq); extern struct tick_device *tick_get_broadcast_device(void); extern struct cpumask *tick_get_broadcast_mask(void); -#else /* !BROADCAST */ +# else /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST: */ static inline void tick_install_broadcast_device(struct clock_event_device *dev) { } static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } @@ -113,7 +81,40 @@ static inline void tick_set_periodic_handler(struct clock_event_device *dev, int { dev->event_handler = tick_handle_periodic; } -#endif /* !BROADCAST */ +# 
endif /* !CONFIG_GENERIC_CLOCKEVENTS_BROADCAST */ + +#else /* !GENERIC_CLOCKEVENTS: */ +static inline void tick_suspend(void) { } +static inline void tick_resume(void) { } +#endif /* !GENERIC_CLOCKEVENTS */ + +/* Oneshot related functions */ +#ifdef CONFIG_TICK_ONESHOT +extern void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt); +extern int tick_program_event(ktime_t expires, int force); +extern void tick_oneshot_notify(void); +extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); +extern void tick_resume_oneshot(void); +static inline bool tick_oneshot_possible(void) { return true; } +extern int tick_oneshot_mode_active(void); +extern void tick_clock_notify(void); +extern int tick_check_oneshot_change(int allow_nohz); +extern int tick_init_highres(void); +#else /* !CONFIG_TICK_ONESHOT: */ +static inline +void tick_setup_oneshot(struct clock_event_device *newdev, + void (*handler)(struct clock_event_device *), + ktime_t nextevt) { BUG(); } +static inline void tick_resume_oneshot(void) { BUG(); } +static inline int tick_program_event(ktime_t expires, int force) { return 0; } +static inline void tick_oneshot_notify(void) { } +static inline bool tick_oneshot_possible(void) { return false; } +static inline int tick_oneshot_mode_active(void) { return 0; } +static inline void tick_clock_notify(void) { } +static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } +#endif /* !CONFIG_TICK_ONESHOT */ /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) @@ -125,7 +126,7 @@ extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); -#else /* BROADCAST && ONESHOT */ +#else /* !(BROADCAST && ONESHOT): */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } @@ -133,7 +134,7 @@ static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return tick_oneshot_possible(); } -#endif /* !BROADCAST && ONESHOT */ +#endif /* !(BROADCAST && ONESHOT) */ /* NO_HZ_FULL internal */ #ifdef CONFIG_NO_HZ_FULL -- cgit v1.3-8-gc7d7 From 72cbbc8994242b5b43753738c01bf07bf29cb70d Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:19 -0700 Subject: tracing: Add kprobe flag add TRACE_EVENT_FL_KPROBE flag to differentiate kprobe type of tracepoints, since bpf programs can only be attached to kprobe type of PERF_TYPE_TRACEPOINT perf events. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. 
Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-3-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 3 +++ kernel/trace/trace_kprobe.c | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index c674ee8f7fca..77325e1a1816 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -252,6 +252,7 @@ enum { TRACE_EVENT_FL_WAS_ENABLED_BIT, TRACE_EVENT_FL_USE_CALL_FILTER_BIT, TRACE_EVENT_FL_TRACEPOINT_BIT, + TRACE_EVENT_FL_KPROBE_BIT, }; /* @@ -265,6 +266,7 @@ enum { * it is best to clear the buffers that used it). * USE_CALL_FILTER - For ftrace internal events, don't use file filter * TRACEPOINT - Event is a tracepoint + * KPROBE - Event is a kprobe */ enum { TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), @@ -274,6 +276,7 @@ enum { TRACE_EVENT_FL_WAS_ENABLED = (1 << TRACE_EVENT_FL_WAS_ENABLED_BIT), TRACE_EVENT_FL_USE_CALL_FILTER = (1 << TRACE_EVENT_FL_USE_CALL_FILTER_BIT), TRACE_EVENT_FL_TRACEPOINT = (1 << TRACE_EVENT_FL_TRACEPOINT_BIT), + TRACE_EVENT_FL_KPROBE = (1 << TRACE_EVENT_FL_KPROBE_BIT), }; struct ftrace_event_call { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index d73f565b4e06..8fa549f6f528 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1286,7 +1286,7 @@ static int register_kprobe_event(struct trace_kprobe *tk) kfree(call->print_fmt); return -ENODEV; } - call->flags = 0; + call->flags = TRACE_EVENT_FL_KPROBE; call->class->reg = kprobe_register; call->data = tk; ret = trace_add_event_call(call); -- cgit v1.3-8-gc7d7 From 2541517c32be2531e0da59dfd7efc1ce844644f5 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:20 -0700 Subject: tracing, perf: Implement BPF programs attached to kprobes BPF programs, attached to kprobes, provide a safe way to execute user-defined BPF byte-code programs without being able to crash or hang the kernel in any way. The BPF engine makes sure that such programs have a finite execution time and that they cannot break out of their sandbox. The user interface is to attach to a kprobe via the perf syscall: struct perf_event_attr attr = { .type = PERF_TYPE_TRACEPOINT, .config = event_id, ... }; event_fd = perf_event_open(&attr,...); ioctl(event_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); 'prog_fd' is a file descriptor associated with BPF program previously loaded. 'event_id' is an ID of the kprobe created. Closing 'event_fd': close(event_fd); ... automatically detaches BPF program from it. BPF programs can call in-kernel helper functions to: - lookup/update/delete elements in maps - probe_read - wraper of probe_kernel_read() used to access any kernel data structures BPF programs receive 'struct pt_regs *' as an input ('struct pt_regs' is architecture dependent) and return 0 to ignore the event and 1 to store kprobe event into the ring buffer. Note, kprobes are a fundamentally _not_ a stable kernel ABI, so BPF programs attached to kprobes must be recompiled for every kernel version and user must supply correct LINUX_VERSION_CODE in attr.kern_version during bpf_prog_load() call. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Reviewed-by: Masami Hiramatsu Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. 
Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-4-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 11 ++++ include/uapi/linux/bpf.h | 3 + include/uapi/linux/perf_event.h | 1 + kernel/bpf/syscall.c | 7 ++- kernel/events/core.c | 59 ++++++++++++++++++ kernel/trace/Makefile | 1 + kernel/trace/bpf_trace.c | 130 ++++++++++++++++++++++++++++++++++++++++ kernel/trace/trace_kprobe.c | 8 +++ 8 files changed, 219 insertions(+), 1 deletion(-) create mode 100644 kernel/trace/bpf_trace.c (limited to 'kernel') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 77325e1a1816..0aa535bc9f05 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -13,6 +13,7 @@ struct trace_array; struct trace_buffer; struct tracer; struct dentry; +struct bpf_prog; struct trace_print_flags { unsigned long mask; @@ -306,6 +307,7 @@ struct ftrace_event_call { #ifdef CONFIG_PERF_EVENTS int perf_refcount; struct hlist_head __percpu *perf_events; + struct bpf_prog *prog; int (*perf_perm)(struct ftrace_event_call *, struct perf_event *); @@ -551,6 +553,15 @@ event_trigger_unlock_commit_regs(struct ftrace_event_file *file, event_triggers_post_call(file, tt); } +#ifdef CONFIG_BPF_SYSCALL +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx); +#else +static inline unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + return 1; +} +#endif + enum { FILTER_OTHER = 0, FILTER_STATIC_STRING, diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 45da7ec7d274..b2948feeb70b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -118,6 +118,7 @@ enum bpf_map_type { enum bpf_prog_type { BPF_PROG_TYPE_UNSPEC, BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, }; /* flags for BPF_MAP_UPDATE_ELEM command */ @@ -151,6 +152,7 @@ union bpf_attr { __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* checked when prog_type=kprobe */ }; } __attribute__((aligned(8))); @@ -162,6 +164,7 @@ enum bpf_func_id { BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */ BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ + BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ __BPF_FUNC_MAX_ID, }; diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 3bb40ddadbe5..91803e54ee73 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -381,6 +381,7 @@ struct perf_event_attr { #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) #define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) #define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) enum perf_event_ioc_flags { PERF_IOC_FLAG_GROUP = 1U << 0, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 536edc2be307..504c10b990ef 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -16,6 +16,7 @@ #include #include #include +#include static LIST_HEAD(bpf_map_types); @@ -467,7 +468,7 @@ struct bpf_prog *bpf_prog_get(u32 ufd) } /* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD log_buf +#define BPF_PROG_LOAD_LAST_FIELD kern_version static int bpf_prog_load(union bpf_attr *attr) { @@ 
-492,6 +493,10 @@ static int bpf_prog_load(union bpf_attr *attr) if (attr->insn_cnt >= BPF_MAXINSNS) return -EINVAL; + if (type == BPF_PROG_TYPE_KPROBE && + attr->kern_version != LINUX_VERSION_CODE) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) diff --git a/kernel/events/core.c b/kernel/events/core.c index c40c2cac2d8e..5c13862d3e85 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -42,6 +42,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -3407,6 +3409,7 @@ errout: } static void perf_event_free_filter(struct perf_event *event); +static void perf_event_free_bpf_prog(struct perf_event *event); static void free_event_rcu(struct rcu_head *head) { @@ -3416,6 +3419,7 @@ static void free_event_rcu(struct rcu_head *head) if (event->ns) put_pid_ns(event->ns); perf_event_free_filter(event); + perf_event_free_bpf_prog(event); kfree(event); } @@ -3928,6 +3932,7 @@ static inline int perf_fget_light(int fd, struct fd *p) static int perf_event_set_output(struct perf_event *event, struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd); static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) { @@ -3981,6 +3986,9 @@ static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned lon case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); + case PERF_EVENT_IOC_SET_BPF: + return perf_event_set_bpf_prog(event, arg); + default: return -ENOTTY; } @@ -6455,6 +6463,49 @@ static void perf_event_free_filter(struct perf_event *event) ftrace_profile_free_filter(event); } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + struct bpf_prog *prog; + + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -EINVAL; + + if (event->tp_event->prog) + return -EEXIST; + + if (!(event->tp_event->flags & TRACE_EVENT_FL_KPROBE)) + /* bpf programs can only be attached to kprobes */ + return -EINVAL; + + prog = bpf_prog_get(prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->prog_type != BPF_PROG_TYPE_KPROBE) { + /* valid fd, but invalid bpf program type */ + bpf_prog_put(prog); + return -EINVAL; + } + + event->tp_event->prog = prog; + + return 0; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ + struct bpf_prog *prog; + + if (!event->tp_event) + return; + + prog = event->tp_event->prog; + if (prog) { + event->tp_event->prog = NULL; + bpf_prog_put(prog); + } +} + #else static inline void perf_tp_register(void) @@ -6470,6 +6521,14 @@ static void perf_event_free_filter(struct perf_event *event) { } +static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd) +{ + return -ENOENT; +} + +static void perf_event_free_bpf_prog(struct perf_event *event) +{ +} #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 98f26588255e..c575a300103b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o +obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) diff --git 
a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c new file mode 100644 index 000000000000..f1e87da91da3 --- /dev/null +++ b/kernel/trace/bpf_trace.c @@ -0,0 +1,130 @@ +/* Copyright (c) 2011-2015 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include +#include "trace.h" + +static DEFINE_PER_CPU(int, bpf_prog_active); + +/** + * trace_call_bpf - invoke BPF program + * @prog: BPF program + * @ctx: opaque context pointer + * + * kprobe handlers execute BPF programs via this helper. + * Can be used from static tracepoints in the future. + * + * Return: BPF programs always return an integer which is interpreted by + * kprobe handler as: + * 0 - return from kprobe (event is filtered out) + * 1 - store kprobe event into ring buffer + * Other values are reserved and currently alias to 1 + */ +unsigned int trace_call_bpf(struct bpf_prog *prog, void *ctx) +{ + unsigned int ret; + + if (in_nmi()) /* not supported yet */ + return 1; + + preempt_disable(); + + if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1)) { + /* + * since some bpf program is already running on this cpu, + * don't call into another bpf program (same or different) + * and don't send kprobe event into ring-buffer, + * so return zero here + */ + ret = 0; + goto out; + } + + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + + out: + __this_cpu_dec(bpf_prog_active); + preempt_enable(); + + return ret; +} +EXPORT_SYMBOL_GPL(trace_call_bpf); + +static u64 bpf_probe_read(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + void *dst = (void *) (long) r1; + int size = (int) r2; + void *unsafe_ptr = (void *) (long) r3; + + return probe_kernel_read(dst, unsafe_ptr, size); +} + +static const struct bpf_func_proto bpf_probe_read_proto = { + .func = bpf_probe_read, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +{ + switch (func_id) { + case BPF_FUNC_map_lookup_elem: + return &bpf_map_lookup_elem_proto; + case BPF_FUNC_map_update_elem: + return &bpf_map_update_elem_proto; + case BPF_FUNC_map_delete_elem: + return &bpf_map_delete_elem_proto; + case BPF_FUNC_probe_read: + return &bpf_probe_read_proto; + default: + return NULL; + } +} + +/* bpf+kprobe programs can access fields of 'struct pt_regs' */ +static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type type) +{ + /* check bounds */ + if (off < 0 || off >= sizeof(struct pt_regs)) + return false; + + /* only read is allowed */ + if (type != BPF_READ) + return false; + + /* disallow misaligned access */ + if (off % size != 0) + return false; + + return true; +} + +static struct bpf_verifier_ops kprobe_prog_ops = { + .get_func_proto = kprobe_prog_func_proto, + .is_valid_access = kprobe_prog_is_valid_access, +}; + +static struct bpf_prog_type_list kprobe_tl = { + .ops = &kprobe_prog_ops, + .type = BPF_PROG_TYPE_KPROBE, +}; + +static int __init register_kprobe_prog_ops(void) +{ + bpf_register_prog_type(&kprobe_tl); + return 0; +} +late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 8fa549f6f528..dc3462507d7c 100644 --- a/kernel/trace/trace_kprobe.c +++ 
b/kernel/trace/trace_kprobe.c @@ -1134,11 +1134,15 @@ static void kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; @@ -1165,11 +1169,15 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri, struct pt_regs *regs) { struct ftrace_event_call *call = &tk->tp.call; + struct bpf_prog *prog = call->prog; struct kretprobe_trace_entry_head *entry; struct hlist_head *head; int size, __size, dsize; int rctx; + if (prog && !trace_call_bpf(prog, regs)) + return; + head = this_cpu_ptr(call->perf_events); if (hlist_empty(head)) return; -- cgit v1.3-8-gc7d7 From d9847d310ab4003725e6ed1822682e24bd406908 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:21 -0700 Subject: tracing: Allow BPF programs to call bpf_ktime_get_ns() bpf_ktime_get_ns() is used by programs to compute time delta between events or as a timestamp Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-5-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index b2948feeb70b..238c6883877b 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -165,6 +165,7 @@ enum bpf_func_id { BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */ BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ + BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index f1e87da91da3..8f5787294971 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -78,6 +78,18 @@ static const struct bpf_func_proto bpf_probe_read_proto = { .arg3_type = ARG_ANYTHING, }; +static u64 bpf_ktime_get_ns(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5) +{ + /* NMI safe access to clock monotonic */ + return ktime_get_mono_fast_ns(); +} + +static const struct bpf_func_proto bpf_ktime_get_ns_proto = { + .func = bpf_ktime_get_ns, + .gpl_only = true, + .ret_type = RET_INTEGER, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -89,6 +101,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_map_delete_elem_proto; case BPF_FUNC_probe_read: return &bpf_probe_read_proto; + case BPF_FUNC_ktime_get_ns: + return &bpf_ktime_get_ns_proto; default: return NULL; } -- cgit v1.3-8-gc7d7 From 9c959c863f8217a2ff3d7c296e8223654d240569 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Wed, 25 Mar 2015 12:49:22 -0700 Subject: tracing: Allow BPF programs to call bpf_trace_printk() Debugging of BPF programs needs some form of printk from the program, so let programs call limited trace_printk() with %d %u %x %p modifiers 
only. Similar to kernel modules, during program load verifier checks whether program is calling bpf_trace_printk() and if so, kernel allocates trace_printk buffers and emits big 'this is debug only' banner. Signed-off-by: Alexei Starovoitov Reviewed-by: Steven Rostedt Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1427312966-8434-6-git-send-email-ast@plumgrid.com Signed-off-by: Ingo Molnar --- include/uapi/linux/bpf.h | 1 + kernel/trace/bpf_trace.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 238c6883877b..cc47ef41076a 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -166,6 +166,7 @@ enum bpf_func_id { BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */ BPF_FUNC_probe_read, /* int bpf_probe_read(void *dst, int size, void *src) */ BPF_FUNC_ktime_get_ns, /* u64 bpf_ktime_get_ns(void) */ + BPF_FUNC_trace_printk, /* int bpf_trace_printk(const char *fmt, int fmt_size, ...) */ __BPF_FUNC_MAX_ID, }; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 8f5787294971..2d56ce501632 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "trace.h" static DEFINE_PER_CPU(int, bpf_prog_active); @@ -90,6 +91,74 @@ static const struct bpf_func_proto bpf_ktime_get_ns_proto = { .ret_type = RET_INTEGER, }; +/* + * limited trace_printk() + * only %d %u %x %ld %lu %lx %lld %llu %llx %p conversion specifiers allowed + */ +static u64 bpf_trace_printk(u64 r1, u64 fmt_size, u64 r3, u64 r4, u64 r5) +{ + char *fmt = (char *) (long) r1; + int mod[3] = {}; + int fmt_cnt = 0; + int i; + + /* + * bpf_check()->check_func_arg()->check_stack_boundary() + * guarantees that fmt points to bpf program stack, + * fmt_size bytes of it were initialized and fmt_size > 0 + */ + if (fmt[--fmt_size] != 0) + return -EINVAL; + + /* check format string for allowed specifiers */ + for (i = 0; i < fmt_size; i++) { + if ((!isprint(fmt[i]) && !isspace(fmt[i])) || !isascii(fmt[i])) + return -EINVAL; + + if (fmt[i] != '%') + continue; + + if (fmt_cnt >= 3) + return -EINVAL; + + /* fmt[i] != 0 && fmt[last] == 0, so we can access fmt[i + 1] */ + i++; + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } else if (fmt[i] == 'p') { + mod[fmt_cnt]++; + i++; + if (!isspace(fmt[i]) && !ispunct(fmt[i]) && fmt[i] != 0) + return -EINVAL; + fmt_cnt++; + continue; + } + + if (fmt[i] == 'l') { + mod[fmt_cnt]++; + i++; + } + + if (fmt[i] != 'd' && fmt[i] != 'u' && fmt[i] != 'x') + return -EINVAL; + fmt_cnt++; + } + + return __trace_printk(1/* fake ip will not be printed */, fmt, + mod[0] == 2 ? r3 : mod[0] == 1 ? (long) r3 : (u32) r3, + mod[1] == 2 ? r4 : mod[1] == 1 ? (long) r4 : (u32) r4, + mod[2] == 2 ? r5 : mod[2] == 1 ? 
(long) r5 : (u32) r5); +} + +static const struct bpf_func_proto bpf_trace_printk_proto = { + .func = bpf_trace_printk, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_STACK, + .arg2_type = ARG_CONST_STACK_SIZE, +}; + static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) { switch (func_id) { @@ -103,6 +172,15 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_probe_read_proto; case BPF_FUNC_ktime_get_ns: return &bpf_ktime_get_ns_proto; + + case BPF_FUNC_trace_printk: + /* + * this program might be calling bpf_trace_printk, + * so allocate per-cpu printk buffers + */ + trace_printk_init_buffers(); + + return &bpf_trace_printk_proto; default: return NULL; } -- cgit v1.3-8-gc7d7 From 345527b1edce8df719e0884500c76832a18211c3 Mon Sep 17 00:00:00 2001 From: Preeti U Murthy Date: Mon, 30 Mar 2015 14:59:19 +0530 Subject: clockevents: Fix cpu_down() race for hrtimer based broadcasting It was found when doing a hotplug stress test on POWER, that the machine either hit softlockups or rcu_sched stall warnings. The issue was traced to commit: 7cba160ad789 ("powernv/cpuidle: Redesign idle states management") which exposed the cpu_down() race with hrtimer based broadcast mode: 5d1638acb9f6 ("tick: Introduce hrtimer based broadcast") The race is the following: Assume CPU1 is the CPU which holds the hrtimer broadcasting duty before it is taken down. CPU0 CPU1 cpu_down() take_cpu_down() disable_interrupts() cpu_die() while (CPU1 != CPU_DEAD) { msleep(100); switch_to_idle(); stop_cpu_timer(); schedule_broadcast(); } tick_cleanup_cpu_dead() take_over_broadcast() So after CPU1 disabled interrupts it cannot handle the broadcast hrtimer anymore, so CPU0 will be stuck forever. Fix this by explicitly taking over broadcast duty before cpu_die(). This is a temporary workaround. What we really want is a callback in the clockevent device which allows us to do that from the dying CPU by pushing the hrtimer onto a different cpu. That might involve an IPI and is definitely more complex than this immediate fix. Changelog was picked up from: https://lkml.org/lkml/2015/2/16/213 Suggested-by: Thomas Gleixner Tested-by: Nicolas Pitre Signed-off-by: Preeti U. Murthy Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: nicolas.pitre@linaro.org Cc: peterz@infradead.org Cc: rjw@rjwysocki.net Fixes: http://linuxppc.10917.n7.nabble.com/offlining-cpus-breakage-td88619.html Link: http://lkml.kernel.org/r/20150330092410.24979.59887.stgit@preeti.in.ibm.com [ Merged it to the latest timer tree, renamed the callback, tidied up the changelog. 
] Signed-off-by: Ingo Molnar --- include/linux/tick.h | 6 ++++++ kernel/cpu.c | 2 ++ kernel/time/tick-broadcast.c | 19 +++++++++++-------- 3 files changed, 19 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 589868b09aff..f9ff225d53c0 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -36,6 +36,12 @@ extern void tick_irq_enter(void); static inline void tick_irq_enter(void) { } #endif +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern void hotplug_cpu__broadcast_tick_pull(int dead_cpu); +#else +static inline void hotplug_cpu__broadcast_tick_pull(int dead_cpu) { } +#endif + #ifdef CONFIG_NO_HZ_COMMON extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); diff --git a/kernel/cpu.c b/kernel/cpu.c index 1972b161c61e..af5db20e5803 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include "smpboot.h" @@ -411,6 +412,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) while (!idle_cpu(cpu)) cpu_relax(); + hotplug_cpu__broadcast_tick_pull(cpu); /* This actually kills the CPU. */ __cpu_die(cpu); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 19cfb381faa9..f5e0fd5652dc 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -680,14 +680,19 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -static void broadcast_move_bc(int deadcpu) +void hotplug_cpu__broadcast_tick_pull(int deadcpu) { - struct clock_event_device *bc = tick_broadcast_device.evtdev; + struct clock_event_device *bc; + unsigned long flags; - if (!bc || !broadcast_needs_cpu(bc, deadcpu)) - return; - /* This moves the broadcast assignment to this cpu */ - clockevents_program_event(bc, bc->next_event, 1); + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + bc = tick_broadcast_device.evtdev; + + if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* This moves the broadcast assignment to this CPU: */ + clockevents_program_event(bc, bc->next_event, 1); + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } /* @@ -924,8 +929,6 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); cpumask_clear_cpu(cpu, tick_broadcast_force_mask); - broadcast_move_bc(cpu); - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.3-8-gc7d7 From e1abf2cc8d5d80b41c4419368ec743ccadbb131e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 2 Apr 2015 15:51:39 +0200 Subject: bpf: Fix the build on BPF_SYSCALL=y && !CONFIG_TRACING kernels, make it more configurable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So bpf_tracing.o depends on CONFIG_BPF_SYSCALL - but that's not its only dependency, it also depends on the tracing infrastructure and on kprobes, without which it will fail to build with: In file included from kernel/trace/bpf_trace.c:14:0: kernel/trace/trace.h: In function ‘trace_test_and_set_recursion’: kernel/trace/trace.h:491:28: error: ‘struct task_struct’ has no member named ‘trace_recursion’ unsigned int val = current->trace_recursion; [...] It took quite some time to trigger this build failure, because right now BPF_SYSCALL is very obscure, depends on CONFIG_EXPERT. So also make BPF_SYSCALL more configurable, not just under CONFIG_EXPERT. 
If BPF_SYSCALL, tracing and kprobes are enabled then enable the bpf_tracing gateway as well. We might want to make this an interactive option later on, although I'd not complicate it unnecessarily: enabling BPF_SYSCALL is enough of an indicator that the user wants BPF support. Cc: Alexei Starovoitov Cc: Andrew Morton Cc: Arnaldo Carvalho de Melo Cc: Arnaldo Carvalho de Melo Cc: Daniel Borkmann Cc: David S. Miller Cc: Jiri Olsa Cc: Linus Torvalds Cc: Masami Hiramatsu Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Peter Zijlstra Cc: Steven Rostedt Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- init/Kconfig | 2 +- kernel/trace/Kconfig | 8 ++++++++ kernel/trace/Makefile | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/init/Kconfig b/init/Kconfig index f5dbc6d4261b..2b4d055aca4a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1513,7 +1513,7 @@ config EVENTFD # syscall, maps, verifier config BPF_SYSCALL - bool "Enable bpf() system call" if EXPERT + bool "Enable bpf() system call" select ANON_INODES select BPF default n diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a5da09c899dd..c8e53c051293 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -432,6 +432,14 @@ config UPROBE_EVENT This option is required if you plan to use perf-probe subcommand of perf tools on user space applications. +config BPF_EVENTS + depends on BPF_SYSCALL + depends on KPROBE_EVENT + bool + default y + help + This allows the user to attach BPF programs to kprobe events. + config PROBE_EVENTS def_bool n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index c575a300103b..9b1044e936a6 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -53,7 +53,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o -obj-$(CONFIG_BPF_SYSCALL) += bpf_trace.o +obj-$(CONFIG_BPF_EVENTS) += bpf_trace.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o obj-$(CONFIG_TRACEPOINTS) += power-traces.o ifeq ($(CONFIG_PM),y) -- cgit v1.3-8-gc7d7 From e8c6deac69629c0cb97c3d3272f8631ef17f8f0f Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:10 +0200 Subject: perf: Add data_{offset,size} to user_page Currently, the actual perf ring buffer is one page into the mmap area, following the user page and the userspace follows this convention. This patch adds data_{offset,size} fields to user_page that can be used by userspace instead for locating perf data in the mmap area. This is also helpful when mapping existing or shared buffers if their size is not known in advance. Right now, it is made to follow the existing convention that data_offset == PAGE_SIZE and data_offset + data_size == mmap_size. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-2-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 5 +++++ kernel/events/core.c | 2 ++ 2 files changed, 7 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 91803e54ee73..86c44ae66d43 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -522,9 +522,14 @@ struct perf_event_mmap_page { * In this case the kernel will not over-write unread data. * * See perf_output_put_handle() for the data ordering. + * + * data_{offset,size} indicate the location and size of the perf record + * buffer within the mmapped area. */ __u64 data_head; /* head in the data section */ __u64 data_tail; /* user-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ }; #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 5c13862d3e85..6efa516f1ab8 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4105,6 +4105,8 @@ static void perf_event_init_userpage(struct perf_event *event) /* Allow new userspace to detect that bit 0 is deprecated */ userpg->cap_bit0_is_deprecated = 1; userpg->size = offsetof(struct perf_event_mmap_page, __reserved); + userpg->data_offset = PAGE_SIZE; + userpg->data_size = perf_data_size(rb); unlock: rcu_read_unlock(); -- cgit v1.3-8-gc7d7 From 45bfb2e50471abbbfd83d40d28c986078b0d24ff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 14 Jan 2015 14:18:11 +0200 Subject: perf: Add AUX area to ring buffer for raw data streams This patch introduces "AUX space" in the perf mmap buffer, intended for exporting high bandwidth data streams to userspace, such as instruction flow traces. AUX space is a ring buffer, defined by aux_{offset,size} fields in the user_page structure, and read/write pointers aux_{head,tail}, which abide by the same rules as data_* counterparts of the main perf buffer. In order to allocate/mmap AUX, userspace needs to set up aux_offset to such an offset that will be greater than data_offset+data_size and aux_size to be the desired buffer size. Both need to be page aligned. Then, same aux_offset and aux_size should be passed to mmap() call and if everything adds up, you should have an AUX buffer as a result. Pages that are mapped into this buffer also come out of user's mlock rlimit plus perf_event_mlock_kb allowance. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Alexander Shishkin Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-3-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 17 +++++ include/uapi/linux/perf_event.h | 16 +++++ kernel/events/core.c | 141 +++++++++++++++++++++++++++++++++------- kernel/events/internal.h | 23 +++++++ kernel/events/ring_buffer.c | 97 +++++++++++++++++++++++++-- 5 files changed, 265 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 401554074de9..5a94f6d6fa91 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -284,6 +284,18 @@ struct pmu { * Return the count value for a counter. */ u64 (*count) (struct perf_event *event); /*optional*/ + + /* + * Set up pmu-private data structures for an AUX area + */ + void *(*setup_aux) (int cpu, void **pages, + int nr_pages, bool overwrite); + /* optional */ + + /* + * Free pmu-private AUX data structures + */ + void (*free_aux) (void *aux); /* optional */ }; /** @@ -862,6 +874,11 @@ static inline bool needs_branch_stack(struct perf_event *event) return event->attr.branch_sample_type != 0; } +static inline bool has_aux(struct perf_event *event) +{ + return event->pmu->setup_aux; +} + extern int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size); extern void perf_output_end(struct perf_output_handle *handle); diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 86c44ae66d43..6c5013a71714 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -530,6 +530,22 @@ struct perf_event_mmap_page { __u64 data_tail; /* user-space written tail */ __u64 data_offset; /* where the buffer starts */ __u64 data_size; /* data buffer size */ + + /* + * AUX area is defined by aux_{offset,size} fields that should be set + * by the userspace, so that + * + * aux_offset >= data_offset + data_size + * + * prior to mmap()ing it. Size of the mmap()ed area should be aux_size. + * + * Ring buffer pointers aux_{head,tail} have the same semantics as + * data_{head,tail} and same ordering rules apply. + */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; }; #define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) diff --git a/kernel/events/core.c b/kernel/events/core.c index 6efa516f1ab8..da51128c337a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4306,6 +4306,9 @@ static void perf_mmap_open(struct vm_area_struct *vma) atomic_inc(&event->mmap_count); atomic_inc(&event->rb->mmap_count); + if (vma->vm_pgoff) + atomic_inc(&event->rb->aux_mmap_count); + if (event->pmu->event_mapped) event->pmu->event_mapped(event); } @@ -4330,6 +4333,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (event->pmu->event_unmapped) event->pmu->event_unmapped(event); + /* + * rb->aux_mmap_count will always drop before rb->mmap_count and + * event->mmap_count, so it is ok to use event->mmap_mutex to + * serialize with perf_mmap here. 
+ */ + if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff && + atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) { + atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= rb->aux_mmap_locked; + + rb_free_aux(rb); + mutex_unlock(&event->mmap_mutex); + } + atomic_dec(&rb->mmap_count); if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) @@ -4403,7 +4420,7 @@ out_put: static const struct vm_operations_struct perf_mmap_vmops = { .open = perf_mmap_open, - .close = perf_mmap_close, + .close = perf_mmap_close, /* non mergable */ .fault = perf_mmap_fault, .page_mkwrite = perf_mmap_fault, }; @@ -4414,10 +4431,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long user_locked, user_lock_limit; struct user_struct *user = current_user(); unsigned long locked, lock_limit; - struct ring_buffer *rb; + struct ring_buffer *rb = NULL; unsigned long vma_size; unsigned long nr_pages; - long user_extra, extra; + long user_extra = 0, extra = 0; int ret = 0, flags = 0; /* @@ -4432,7 +4449,66 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (vma->vm_pgoff == 0) { + nr_pages = (vma_size / PAGE_SIZE) - 1; + } else { + /* + * AUX area mapping: if rb->aux_nr_pages != 0, it's already + * mapped, all subsequent mappings should have the same size + * and offset. Must be above the normal perf buffer. + */ + u64 aux_offset, aux_size; + + if (!event->rb) + return -EINVAL; + + nr_pages = vma_size / PAGE_SIZE; + + mutex_lock(&event->mmap_mutex); + ret = -EINVAL; + + rb = event->rb; + if (!rb) + goto aux_unlock; + + aux_offset = ACCESS_ONCE(rb->user_page->aux_offset); + aux_size = ACCESS_ONCE(rb->user_page->aux_size); + + if (aux_offset < perf_data_size(rb) + PAGE_SIZE) + goto aux_unlock; + + if (aux_offset != vma->vm_pgoff << PAGE_SHIFT) + goto aux_unlock; + + /* already mapped with a different offset */ + if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff) + goto aux_unlock; + + if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE) + goto aux_unlock; + + /* already mapped with a different size */ + if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages) + goto aux_unlock; + + if (!is_power_of_2(nr_pages)) + goto aux_unlock; + + if (!atomic_inc_not_zero(&rb->mmap_count)) + goto aux_unlock; + + if (rb_has_aux(rb)) { + atomic_inc(&rb->aux_mmap_count); + ret = 0; + goto unlock; + } + + atomic_set(&rb->aux_mmap_count, 1); + user_extra = nr_pages; + + goto accounting; + } /* * If we have rb pages ensure they're a power-of-two number, so we @@ -4444,9 +4520,6 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - if (vma->vm_pgoff != 0) - return -EINVAL; - WARN_ON_ONCE(event->ctx->parent_ctx); again: mutex_lock(&event->mmap_mutex); @@ -4470,6 +4543,8 @@ again: } user_extra = nr_pages + 1; + +accounting: user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); /* @@ -4479,7 +4554,6 @@ again: user_locked = atomic_long_read(&user->locked_vm) + user_extra; - extra = 0; if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; @@ -4493,35 +4567,45 @@ again: goto unlock; } - WARN_ON(event->rb); + WARN_ON(!rb && event->rb); if (vma->vm_flags & VM_WRITE) flags |= RING_BUFFER_WRITABLE; - rb = rb_alloc(nr_pages, - event->attr.watermark ? 
event->attr.wakeup_watermark : 0, - event->cpu, flags); - if (!rb) { - ret = -ENOMEM; - goto unlock; - } + rb = rb_alloc(nr_pages, + event->attr.watermark ? event->attr.wakeup_watermark : 0, + event->cpu, flags); - atomic_set(&rb->mmap_count, 1); - rb->mmap_locked = extra; - rb->mmap_user = get_current_user(); + if (!rb) { + ret = -ENOMEM; + goto unlock; + } - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->pinned_vm += extra; + atomic_set(&rb->mmap_count, 1); + rb->mmap_user = get_current_user(); + rb->mmap_locked = extra; - ring_buffer_attach(event, rb); + ring_buffer_attach(event, rb); - perf_event_init_userpage(event); - perf_event_update_userpage(event); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + } else { + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, flags); + if (!ret) + rb->aux_mmap_locked = extra; + } unlock: - if (!ret) + if (!ret) { + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->pinned_vm += extra; + atomic_inc(&event->mmap_count); + } else if (rb) { + atomic_dec(&rb->mmap_count); + } +aux_unlock: mutex_unlock(&event->mmap_mutex); /* @@ -7506,6 +7590,13 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event) if (output_event->clock != event->clock) goto out; + /* + * If both events generate aux data, they must be on the same PMU + */ + if (has_aux(event) && has_aux(output_event) && + event->pmu != output_event->pmu) + goto out; + set: mutex_lock(&event->mmap_mutex); /* Can't redirect output if we've got an active mmap() */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 569b218782ad..0f6d08015927 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -35,6 +35,16 @@ struct ring_buffer { unsigned long mmap_locked; struct user_struct *mmap_user; + /* AUX area */ + unsigned long aux_pgoff; + int aux_nr_pages; + atomic_t aux_mmap_count; + unsigned long aux_mmap_locked; + void (*free_aux)(void *); + atomic_t aux_refcount; + void **aux_pages; + void *aux_priv; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; @@ -43,6 +53,14 @@ extern void rb_free(struct ring_buffer *rb); extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); +extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, int flags); +extern void rb_free_aux(struct ring_buffer *rb); + +static inline bool rb_has_aux(struct ring_buffer *rb) +{ + return !!rb->aux_nr_pages; +} extern void perf_event_header__init_id(struct perf_event_header *header, @@ -81,6 +99,11 @@ static inline unsigned long perf_data_size(struct ring_buffer *rb) return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); } +static inline unsigned long perf_aux_size(struct ring_buffer *rb) +{ + return rb->aux_nr_pages << PAGE_SHIFT; +} + #define DEFINE_OUTPUT_COPY(func_name, memcpy_func) \ static inline unsigned long \ func_name(struct perf_output_handle *handle, \ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index eadb95ce7aac..3de9c4e9ea9f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,14 +243,87 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, + pgoff_t pgoff, int nr_pages, int flags) +{ + bool overwrite = !(flags & RING_BUFFER_WRITABLE); + int node = (event->cpu == -1) ? 
-1 : cpu_to_node(event->cpu); + int ret = -ENOMEM; + + if (!has_aux(event)) + return -ENOTSUPP; + + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); + if (!rb->aux_pages) + return -ENOMEM; + + rb->free_aux = event->pmu->free_aux; + for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages; + rb->aux_nr_pages++) { + struct page *page; + + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + goto out; + + rb->aux_pages[rb->aux_nr_pages] = page_address(page); + } + + rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, + overwrite); + if (!rb->aux_priv) + goto out; + + ret = 0; + + /* + * aux_pages (and pmu driver's private data, aux_priv) will be + * referenced in both producer's and consumer's contexts, thus + * we keep a refcount here to make sure either of the two can + * reference them safely. + */ + atomic_set(&rb->aux_refcount, 1); + +out: + if (!ret) + rb->aux_pgoff = pgoff; + else + rb_free_aux(rb); + + return ret; +} + +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + for (pg = 0; pg < rb->aux_nr_pages; pg++) + free_page((unsigned long)rb->aux_pages[pg]); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; +} + +void rb_free_aux(struct ring_buffer *rb) +{ + if (atomic_dec_and_test(&rb->aux_refcount)) + __rb_free_aux(rb); +} + #ifndef CONFIG_PERF_USE_VMALLOC /* * Back perf_mmap() with regular GFP_KERNEL-0 pages. */ -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { if (pgoff > rb->nr_pages) return NULL; @@ -340,8 +413,8 @@ static int data_page_nr(struct ring_buffer *rb) return rb->nr_pages << page_order(rb); } -struct page * -perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +static struct page * +__perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) { /* The '>' counts in the user page. */ if (pgoff > data_page_nr(rb)) @@ -416,3 +489,19 @@ fail: } #endif + +struct page * +perf_mmap_to_page(struct ring_buffer *rb, unsigned long pgoff) +{ + if (rb->aux_nr_pages) { + /* above AUX space */ + if (pgoff > rb->aux_pgoff + rb->aux_nr_pages) + return NULL; + + /* AUX space */ + if (pgoff >= rb->aux_pgoff) + return virt_to_page(rb->aux_pages[pgoff - rb->aux_pgoff]); + } + + return __perf_mmap_to_page(rb, pgoff); +} -- cgit v1.3-8-gc7d7 From 0a4e38e64f5e91ce131cc42ee5bb3925377ec840 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:12 +0200 Subject: perf: Support high-order allocations for AUX space Some pmus (such as BTS or Intel PT without multiple-entry ToPA capability) don't support scatter-gather and will prefer larger contiguous areas for their output regions. This patch adds a new pmu capability to request higher order allocations. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-4-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/ring_buffer.c | 56 ++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 51 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 5a94f6d6fa91..d5a4a8e95808 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -174,6 +174,7 @@ struct perf_event; */ #define PERF_PMU_CAP_NO_INTERRUPT 0x01 #define PERF_PMU_CAP_NO_NMI 0x02 +#define PERF_PMU_CAP_AUX_NO_SG 0x04 /** * struct pmu - generic performance monitoring unit diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 3de9c4e9ea9f..ed0859e33b2f 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,30 +243,74 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +#define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) + +static struct page *rb_alloc_aux_page(int node, int order) +{ + struct page *page; + + if (order > MAX_ORDER) + order = MAX_ORDER; + + do { + page = alloc_pages_node(node, PERF_AUX_GFP, order); + } while (!page && order--); + + if (page && order) { + /* + * Communicate the allocation size to the driver + */ + split_page(page, order); + SetPagePrivate(page); + set_page_private(page, order); + } + + return page; +} + +static void rb_free_aux_page(struct ring_buffer *rb, int idx) +{ + struct page *page = virt_to_page(rb->aux_pages[idx]); + + ClearPagePrivate(page); + page->mapping = NULL; + __free_page(page); +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); - int ret = -ENOMEM; + int ret = -ENOMEM, max_order = 0; if (!has_aux(event)) return -ENOTSUPP; + if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) + /* + * We need to start with the max_order that fits in nr_pages, + * not the other way around, hence ilog2() and not get_order. 
+ */ + max_order = ilog2(nr_pages); + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) return -ENOMEM; rb->free_aux = event->pmu->free_aux; - for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages; - rb->aux_nr_pages++) { + for (rb->aux_nr_pages = 0; rb->aux_nr_pages < nr_pages;) { struct page *page; + int last, order; - page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + order = min(max_order, ilog2(nr_pages - rb->aux_nr_pages)); + page = rb_alloc_aux_page(node, order); if (!page) goto out; - rb->aux_pages[rb->aux_nr_pages] = page_address(page); + for (last = rb->aux_nr_pages + (1 << page_private(page)); + last > rb->aux_nr_pages; rb->aux_nr_pages++) + rb->aux_pages[rb->aux_nr_pages] = page_address(page++); } rb->aux_priv = event->pmu->setup_aux(event->cpu, rb->aux_pages, nr_pages, @@ -304,7 +348,7 @@ static void __rb_free_aux(struct ring_buffer *rb) } for (pg = 0; pg < rb->aux_nr_pages; pg++) - free_page((unsigned long)rb->aux_pages[pg]); + rb_free_aux_page(rb, pg); kfree(rb->aux_pages); rb->aux_nr_pages = 0; -- cgit v1.3-8-gc7d7 From 6a279230391b63130070e0219b0ad09d34d28c89 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:13 +0200 Subject: perf: Add a capability for AUX_NO_SG pmus to do software double buffering For pmus that don't support scatter-gather for AUX data in hardware, it might still make sense to implement software double buffering to avoid losing data while the user is reading data out. For this purpose, add a pmu capability that guarantees multiple high-order chunks for AUX buffer, so that the pmu driver can do switchover tricks. To make use of this feature, add PERF_PMU_CAP_AUX_SW_DOUBLEBUF to your pmu's capability mask. This will make the ring buffer AUX allocation code ensure that the biggest high order allocation for the aux buffer pages is no bigger than half of the total requested buffer size, thus making sure that the buffer has at least two high order allocations. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-5-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 1 + kernel/events/ring_buffer.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d5a4a8e95808..13a1eb3a2a2d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -175,6 +175,7 @@ struct perf_event; #define PERF_PMU_CAP_NO_INTERRUPT 0x01 #define PERF_PMU_CAP_NO_NMI 0x02 #define PERF_PMU_CAP_AUX_NO_SG 0x04 +#define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 /** * struct pmu - generic performance monitoring unit diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index ed0859e33b2f..6e3be7a10c50 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -287,13 +287,26 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, if (!has_aux(event)) return -ENOTSUPP; - if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) + if (event->pmu->capabilities & PERF_PMU_CAP_AUX_NO_SG) { /* * We need to start with the max_order that fits in nr_pages, * not the other way around, hence ilog2() and not get_order. */ max_order = ilog2(nr_pages); + /* + * PMU requests more than one contiguous chunks of memory + * for SW double buffering + */ + if ((event->pmu->capabilities & PERF_PMU_CAP_AUX_SW_DOUBLEBUF) && + !overwrite) { + if (!max_order) + return -EINVAL; + + max_order--; + } + } + rb->aux_pages = kzalloc_node(nr_pages * sizeof(void *), GFP_KERNEL, node); if (!rb->aux_pages) return -ENOMEM; -- cgit v1.3-8-gc7d7 From bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Fri, 30 Jan 2015 12:31:06 +0200 Subject: perf: Add a pmu capability for "exclusive" events Usually, pmus that do, for example, instruction tracing, would only ever be able to have one event per task per cpu (or per perf_event_context). For such pmus it makes sense to disallow creating conflicting events early on, so as to provide consistent behavior for the user. This patch adds a pmu capability that indicates such constraint on event creation. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1422613866-113186-1-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 + kernel/events/core.c | 119 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 119 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 13a1eb3a2a2d..f936a1e51f29 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -176,6 +176,7 @@ struct perf_event; #define PERF_PMU_CAP_NO_NMI 0x02 #define PERF_PMU_CAP_AUX_NO_SG 0x04 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 +#define PERF_PMU_CAP_EXCLUSIVE 0x10 /** * struct pmu - generic performance monitoring unit @@ -196,6 +197,7 @@ struct pmu { int * __percpu pmu_disable_count; struct perf_cpu_context * __percpu pmu_cpu_context; + atomic_t exclusive_cnt; /* < 0: cpu; > 0: tsk */ int task_ctx_nr; int hrtimer_interval_ms; diff --git a/kernel/events/core.c b/kernel/events/core.c index da51128c337a..6d9fdaef7b57 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3459,6 +3459,91 @@ static void unaccount_event(struct perf_event *event) unaccount_event_cpu(event, event->cpu); } +/* + * The following implement mutual exclusion of events on "exclusive" pmus + * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled + * at a time, so we disallow creating events that might conflict, namely: + * + * 1) cpu-wide events in the presence of per-task events, + * 2) per-task events in the presence of cpu-wide events, + * 3) two matching events on the same context. + * + * The former two cases are handled in the allocation path (perf_event_alloc(), + * __free_event()), the latter -- before the first perf_install_in_context(). + */ +static int exclusive_event_init(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return 0; + + /* + * Prevent co-existence of per-task and cpu-wide events on the + * same exclusive pmu. + * + * Negative pmu::exclusive_cnt means there are cpu-wide + * events on this "exclusive" pmu, positive means there are + * per-task events. + * + * Since this is called in perf_event_alloc() path, event::ctx + * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK + * to mean "per-task event", because unlike other attach states it + * never gets cleared. 
+ */ + if (event->attach_state & PERF_ATTACH_TASK) { + if (!atomic_inc_unless_negative(&pmu->exclusive_cnt)) + return -EBUSY; + } else { + if (!atomic_dec_unless_positive(&pmu->exclusive_cnt)) + return -EBUSY; + } + + return 0; +} + +static void exclusive_event_destroy(struct perf_event *event) +{ + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return; + + /* see comment in exclusive_event_init() */ + if (event->attach_state & PERF_ATTACH_TASK) + atomic_dec(&pmu->exclusive_cnt); + else + atomic_inc(&pmu->exclusive_cnt); +} + +static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2) +{ + if ((e1->pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && + (e1->cpu == e2->cpu || + e1->cpu == -1 || + e2->cpu == -1)) + return true; + return false; +} + +/* Called under the same ctx::mutex as perf_install_in_context() */ +static bool exclusive_event_installable(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *iter_event; + struct pmu *pmu = event->pmu; + + if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE)) + return true; + + list_for_each_entry(iter_event, &ctx->event_list, event_entry) { + if (exclusive_event_match(iter_event, event)) + return false; + } + + return true; +} + static void __free_event(struct perf_event *event) { if (!event->parent) { @@ -3472,8 +3557,10 @@ static void __free_event(struct perf_event *event) if (event->ctx) put_ctx(event->ctx); - if (event->pmu) + if (event->pmu) { + exclusive_event_destroy(event); module_put(event->pmu->module); + } call_rcu(&event->rcu_head, free_event_rcu); } @@ -7150,6 +7237,7 @@ got_cpu_context: pmu->event_idx = perf_event_idx_default; list_add_rcu(&pmu->entry, &pmus); + atomic_set(&pmu->exclusive_cnt, 0); ret = 0; unlock: mutex_unlock(&pmus_lock); @@ -7405,16 +7493,23 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, goto err_ns; } + err = exclusive_event_init(event); + if (err) + goto err_pmu; + if (!event->parent) { if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { err = get_callchain_buffers(); if (err) - goto err_pmu; + goto err_per_task; } } return event; +err_per_task: + exclusive_event_destroy(event); + err_pmu: if (event->destroy) event->destroy(event); @@ -7819,6 +7914,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_alloc; } + if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) { + err = -EBUSY; + goto err_context; + } + if (task) { put_task_struct(task); task = NULL; @@ -7941,6 +8041,13 @@ SYSCALL_DEFINE5(perf_event_open, get_ctx(ctx); } + if (!exclusive_event_installable(event, ctx)) { + err = -EBUSY; + mutex_unlock(&ctx->mutex); + fput(event_file); + goto err_context; + } + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8032,6 +8139,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (!exclusive_event_installable(event, ctx)) { + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); + err = -EBUSY; + goto err_free; + } + perf_install_in_context(ctx, event, cpu); perf_unpin_context(ctx); mutex_unlock(&ctx->mutex); -- cgit v1.3-8-gc7d7 From 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:15 +0200 Subject: perf: Add AUX record When there's new data in the AUX space, output a record indicating its offset and size and a set of flags, such as PERF_AUX_FLAG_TRUNCATED, to mean the described data was truncated to fit in 
the ring buffer. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-7-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 19 +++++++++++++++++++ kernel/events/core.c | 34 ++++++++++++++++++++++++++++++++++ kernel/events/internal.h | 3 +++ 3 files changed, 56 insertions(+) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 6c5013a71714..8904ad3a850b 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -768,6 +768,20 @@ enum perf_event_type { */ PERF_RECORD_MMAP2 = 10, + /* + * Records that new data landed in the AUX buffer part. + * + * struct { + * struct perf_event_header header; + * + * u64 aux_offset; + * u64 aux_size; + * u64 flags; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX = 11, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -785,6 +799,11 @@ enum perf_callchain_context { PERF_CONTEXT_MAX = (__u64)-4095, }; +/** + * PERF_RECORD_AUX::flags bits + */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ + #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ diff --git a/kernel/events/core.c b/kernel/events/core.c index 6d9fdaef7b57..dbc2eff32230 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -5916,6 +5916,40 @@ void perf_event_mmap(struct vm_area_struct *vma) perf_event_mmap_event(&mmap_event); } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u64 offset; + u64 size; + u64 flags; + } rec = { + .header = { + .type = PERF_RECORD_AUX, + .misc = 0, + .size = sizeof(rec), + }, + .offset = head, + .size = size, + .flags = flags, + }; + int ret; + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * IRQ throttle logging */ diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 0f6d08015927..4d117a981431 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -62,6 +62,9 @@ static inline bool rb_has_aux(struct ring_buffer *rb) return !!rb->aux_nr_pages; } +void perf_event_aux_event(struct perf_event *event, unsigned long head, + unsigned long size, u64 flags); + extern void perf_event_header__init_id(struct perf_event_header *header, struct perf_sample_data *data, -- cgit v1.3-8-gc7d7 From fdc2670666f40ab3e03143f04d1ebf4a05e2c24a Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:16 +0200 Subject: perf: Add API for PMUs to write to the AUX area For pmus that wish to write data to ring buffer's AUX area, provide perf_aux_output_{begin,end}() calls to initiate/commit data writes, similarly to perf_output_{begin,end}. 
These also use the same output handle structure. Also, similarly to software counterparts, these will direct inherited events' output to parents' ring buffers. After the perf_aux_output_begin() returns successfully, handle->size is set to the maximum amount of data that can be written wrt aux_tail pointer, so that no data that the user hasn't seen will be overwritten, therefore this should always be called before hardware writing is enabled. On success, this will return the pointer to pmu driver's private structure allocated for this aux area by pmu::setup_aux. Same pointer can also be retrieved using perf_get_aux() while hardware writing is enabled. PMU driver should pass the actual amount of data written as a parameter to perf_aux_output_end(). All hardware writes should be completed and visible before this one is called. Additionally, perf_aux_output_skip() will adjust output handle and aux_head in case some part of the buffer has to be skipped over to maintain hardware's alignment constraints. Nested writers are forbidden and guards are in place to catch such attempts. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-8-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 24 +++++++- kernel/events/core.c | 5 +- kernel/events/internal.h | 4 ++ kernel/events/ring_buffer.c | 139 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 168 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f936a1e51f29..45c5873ad9b3 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -573,7 +573,10 @@ struct perf_output_handle { struct ring_buffer *rb; unsigned long wakeup; unsigned long size; - void *addr; + union { + void *addr; + unsigned long head; + }; int page; }; @@ -608,6 +611,14 @@ perf_cgroup_from_task(struct task_struct *task) #ifdef CONFIG_PERF_EVENTS +extern void *perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event); +extern void perf_aux_output_end(struct perf_output_handle *handle, + unsigned long size, bool truncated); +extern int perf_aux_output_skip(struct perf_output_handle *handle, + unsigned long size); +extern void *perf_get_aux(struct perf_output_handle *handle); + extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); @@ -898,6 +909,17 @@ extern void perf_event_disable(struct perf_event *event); extern int __perf_event_disable(void *info); extern void perf_event_task_tick(void); #else /* !CONFIG_PERF_EVENTS: */ +static inline void * +perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event) { return NULL; } +static inline void +perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, + bool truncated) { } +static inline int +perf_aux_output_skip(struct perf_output_handle *handle, + unsigned long size) { return -EINVAL; } +static inline void * +perf_get_aux(struct perf_output_handle *handle) { return NULL; } static inline void perf_event_task_sched_in(struct task_struct *prev, struct 
task_struct *task) { } diff --git a/kernel/events/core.c b/kernel/events/core.c index dbc2eff32230..81e8d14ac59a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3423,7 +3423,6 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); static void ring_buffer_attach(struct perf_event *event, struct ring_buffer *rb); @@ -4361,7 +4360,7 @@ static void rb_free_rcu(struct rcu_head *rcu_head) rb_free(rb); } -static struct ring_buffer *ring_buffer_get(struct perf_event *event) +struct ring_buffer *ring_buffer_get(struct perf_event *event) { struct ring_buffer *rb; @@ -4376,7 +4375,7 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +void ring_buffer_put(struct ring_buffer *rb) { if (!atomic_dec_and_test(&rb->refcount)) return; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 4d117a981431..b701ebc32570 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -36,6 +36,8 @@ struct ring_buffer { struct user_struct *mmap_user; /* AUX area */ + local_t aux_head; + local_t aux_nest; unsigned long aux_pgoff; int aux_nr_pages; atomic_t aux_mmap_count; @@ -56,6 +58,8 @@ extern void perf_event_wakeup(struct perf_event *event); extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, int flags); extern void rb_free_aux(struct ring_buffer *rb); +extern struct ring_buffer *ring_buffer_get(struct perf_event *event); +extern void ring_buffer_put(struct ring_buffer *rb); static inline bool rb_has_aux(struct ring_buffer *rb) { diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 6e3be7a10c50..0cc7b0f39058 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -243,6 +243,145 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) spin_lock_init(&rb->event_lock); } +/* + * This is called before hardware starts writing to the AUX area to + * obtain an output handle and make sure there's room in the buffer. + * When the capture completes, call perf_aux_output_end() to commit + * the recorded data to the buffer. + * + * The ordering is similar to that of perf_output_{begin,end}, with + * the exception of (B), which should be taken care of by the pmu + * driver, since ordering rules will differ depending on hardware. + */ +void *perf_aux_output_begin(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *output_event = event; + unsigned long aux_head, aux_tail; + struct ring_buffer *rb; + + if (output_event->parent) + output_event = output_event->parent; + + /* + * Since this will typically be open across pmu::add/pmu::del, we + * grab ring_buffer's refcount instead of holding rcu read lock + * to make sure it doesn't disappear under us. 
+ */ + rb = ring_buffer_get(output_event); + if (!rb) + return NULL; + + if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount)) + goto err; + + /* + * Nesting is not supported for AUX area, make sure nested + * writers are caught early + */ + if (WARN_ON_ONCE(local_xchg(&rb->aux_nest, 1))) + goto err_put; + + aux_head = local_read(&rb->aux_head); + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + + handle->rb = rb; + handle->event = event; + handle->head = aux_head; + if (aux_head - aux_tail < perf_aux_size(rb)) + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); + else + handle->size = 0; + + /* + * handle->size computation depends on aux_tail load; this forms a + * control dependency barrier separating aux_tail load from aux data + * store that will be enabled on successful return + */ + if (!handle->size) { /* A, matches D */ + event->pending_disable = 1; + perf_output_wakeup(handle); + local_set(&rb->aux_nest, 0); + goto err_put; + } + + return handle->rb->aux_priv; + +err_put: + rb_free_aux(rb); + +err: + ring_buffer_put(rb); + handle->event = NULL; + + return NULL; +} + +/* + * Commit the data written by hardware into the ring buffer by adjusting + * aux_head and posting a PERF_RECORD_AUX into the perf buffer. It is the + * pmu driver's responsibility to observe ordering rules of the hardware, + * so that all the data is externally visible before this is called. + */ +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, + bool truncated) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head = local_read(&rb->aux_head); + u64 flags = 0; + + if (truncated) + flags |= PERF_AUX_FLAG_TRUNCATED; + + local_add(size, &rb->aux_head); + + if (size || flags) { + /* + * Only send RECORD_AUX if we have something useful to communicate + */ + + perf_event_aux_event(handle->event, aux_head, size, flags); + } + + rb->user_page->aux_head = local_read(&rb->aux_head); + + perf_output_wakeup(handle); + handle->event = NULL; + + local_set(&rb->aux_nest, 0); + rb_free_aux(rb); + ring_buffer_put(rb); +} + +/* + * Skip over a given number of bytes in the AUX buffer, due to, for example, + * hardware's alignment constraints. + */ +int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) +{ + struct ring_buffer *rb = handle->rb; + unsigned long aux_head; + + if (size > handle->size) + return -ENOSPC; + + local_add(size, &rb->aux_head); + + handle->head = aux_head; + handle->size -= size; + + return 0; +} + +void *perf_get_aux(struct perf_output_handle *handle) +{ + /* this is only valid between perf_aux_output_begin and *_end */ + if (!handle->event) + return NULL; + + return handle->rb->aux_priv; +} + #define PERF_AUX_GFP (GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY) static struct page *rb_alloc_aux_page(int node, int order) -- cgit v1.3-8-gc7d7 From 2023a0d2829e521fe6ad6b9907f3f90bfbf57142 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:17 +0200 Subject: perf: Support overwrite mode for the AUX area This adds support for overwrite mode in the AUX area, which means "keep collecting data till you're stopped", turning AUX area into a circular buffer, where new data overwrites old data. It does not depend on data buffer's overwrite mode, so that it doesn't lose sideband data that is instrumental for processing AUX data. Overwrite mode is enabled at mapping AUX area read only. Even though aux_tail in the buffer's user page might be user writable, it will be ignored in this mode. 
A PERF_RECORD_AUX with PERF_AUX_FLAG_OVERWRITE set is written to the perf data stream every time an event writes new data to the AUX area. The pmu driver might not be able to infer the exact beginning of the new data in each snapshot, some drivers will only provide the tail, which is aux_offset + aux_size in the AUX record. Consumer has to be able to tell the new data from the old one, for example, by means of time stamps if such are provided in the trace. Consumer is also responsible for disabling any events that might write to the AUX area (thus potentially racing with the consumer) before collecting the data. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-9-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 1 + kernel/events/internal.h | 1 + kernel/events/ring_buffer.c | 48 ++++++++++++++++++++++++++++------------- 3 files changed, 35 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 8904ad3a850b..29ef2f73bb4a 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -803,6 +803,7 @@ enum perf_callchain_context { * PERF_RECORD_AUX::flags bits */ #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index b701ebc32570..ffd51d9f5945 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -40,6 +40,7 @@ struct ring_buffer { local_t aux_nest; unsigned long aux_pgoff; int aux_nr_pages; + int aux_overwrite; atomic_t aux_mmap_count; unsigned long aux_mmap_locked; void (*free_aux)(void *); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 0cc7b0f39058..67b328337a41 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -283,26 +283,33 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, goto err_put; aux_head = local_read(&rb->aux_head); - aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); handle->rb = rb; handle->event = event; handle->head = aux_head; - if (aux_head - aux_tail < perf_aux_size(rb)) - handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); - else - handle->size = 0; + handle->size = 0; /* - * handle->size computation depends on aux_tail load; this forms a - * control dependency barrier separating aux_tail load from aux data - * store that will be enabled on successful return + * In overwrite mode, AUX data stores do not depend on aux_tail, + * therefore (A) control dependency barrier does not exist. The + * (B) <-> (C) ordering is still observed by the pmu driver. 
*/ - if (!handle->size) { /* A, matches D */ - event->pending_disable = 1; - perf_output_wakeup(handle); - local_set(&rb->aux_nest, 0); - goto err_put; + if (!rb->aux_overwrite) { + aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + if (aux_head - aux_tail < perf_aux_size(rb)) + handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); + + /* + * handle->size computation depends on aux_tail load; this forms a + * control dependency barrier separating aux_tail load from aux data + * store that will be enabled on successful return + */ + if (!handle->size) { /* A, matches D */ + event->pending_disable = 1; + perf_output_wakeup(handle); + local_set(&rb->aux_nest, 0); + goto err_put; + } } return handle->rb->aux_priv; @@ -327,13 +334,22 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, bool truncated) { struct ring_buffer *rb = handle->rb; - unsigned long aux_head = local_read(&rb->aux_head); + unsigned long aux_head; u64 flags = 0; if (truncated) flags |= PERF_AUX_FLAG_TRUNCATED; - local_add(size, &rb->aux_head); + /* in overwrite mode, driver provides aux_head via handle */ + if (rb->aux_overwrite) { + flags |= PERF_AUX_FLAG_OVERWRITE; + + aux_head = handle->head; + local_set(&rb->aux_head, aux_head); + } else { + aux_head = local_read(&rb->aux_head); + local_add(size, &rb->aux_head); + } if (size || flags) { /* @@ -480,6 +496,8 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, */ atomic_set(&rb->aux_refcount, 1); + rb->aux_overwrite = overwrite; + out: if (!ret) rb->aux_pgoff = pgoff; -- cgit v1.3-8-gc7d7 From 1a5941312414c71dece6717da9a0fa1303127afa Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:18 +0200 Subject: perf: Add wakeup watermark control to the AUX area When AUX area gets a certain amount of new data, we want to wake up userspace to collect it. This adds a new control to specify how much data will cause a wakeup. This is then passed down to pmu drivers via output handle's "wakeup" field, so that the driver can find the nearest point where it can generate an interrupt. We repurpose __reserved_2 in the event attribute for this, even though it was never checked to be zero before, aux_watermark will only matter for new AUX-aware code, so the old code should still be fine. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-10-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/uapi/linux/perf_event.h | 7 +++++++ kernel/events/core.c | 3 ++- kernel/events/internal.h | 4 +++- kernel/events/ring_buffer.c | 22 +++++++++++++++++++--- 4 files changed, 31 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 29ef2f73bb4a..84819546c8ce 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -261,6 +261,7 @@ enum perf_event_read_format { #define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ /* add: sample_stack_user */ #define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ /* * Hardware event_id to monitor via a performance monitoring event: @@ -366,6 +367,12 @@ struct perf_event_attr { * See asm/perf_regs.h for details. */ __u64 sample_regs_intr; + + /* + * Wakeup watermark for AUX area + */ + __u32 aux_watermark; + __u32 __reserved_2; /* align to __u64 */ }; #define perf_flags(attr) (*(&(attr)->read_format + 1)) diff --git a/kernel/events/core.c b/kernel/events/core.c index 81e8d14ac59a..31f6b504ad62 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4677,7 +4677,8 @@ accounting: perf_event_init_userpage(event); perf_event_update_userpage(event); } else { - ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, flags); + ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages, + event->attr.aux_watermark, flags); if (!ret) rb->aux_mmap_locked = extra; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index ffd51d9f5945..9f6ce9ba4a04 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -27,6 +27,7 @@ struct ring_buffer { local_t lost; /* nr records lost */ long watermark; /* wakeup watermark */ + long aux_watermark; /* poll crap */ spinlock_t event_lock; struct list_head event_list; @@ -38,6 +39,7 @@ struct ring_buffer { /* AUX area */ local_t aux_head; local_t aux_nest; + local_t aux_wakeup; unsigned long aux_pgoff; int aux_nr_pages; int aux_overwrite; @@ -57,7 +59,7 @@ extern struct ring_buffer * rb_alloc(int nr_pages, long watermark, int cpu, int flags); extern void perf_event_wakeup(struct perf_event *event); extern int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, - pgoff_t pgoff, int nr_pages, int flags); + pgoff_t pgoff, int nr_pages, long watermark, int flags); extern void rb_free_aux(struct ring_buffer *rb); extern struct ring_buffer *ring_buffer_get(struct perf_event *event); extern void ring_buffer_put(struct ring_buffer *rb); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 67b328337a41..232f00f273cb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -296,6 +296,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, */ if (!rb->aux_overwrite) { aux_tail = ACCESS_ONCE(rb->user_page->aux_tail); + handle->wakeup = local_read(&rb->aux_wakeup) + rb->aux_watermark; if (aux_head - aux_tail < perf_aux_size(rb)) handle->size = CIRC_SPACE(aux_head, aux_tail, perf_aux_size(rb)); @@ -359,9 +360,12 @@ void perf_aux_output_end(struct 
perf_output_handle *handle, unsigned long size, perf_event_aux_event(handle->event, aux_head, size, flags); } - rb->user_page->aux_head = local_read(&rb->aux_head); + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); - perf_output_wakeup(handle); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + } handle->event = NULL; local_set(&rb->aux_nest, 0); @@ -383,6 +387,14 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) local_add(size, &rb->aux_head); + aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); + if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) { + perf_output_wakeup(handle); + local_add(rb->aux_watermark, &rb->aux_wakeup); + handle->wakeup = local_read(&rb->aux_wakeup) + + rb->aux_watermark; + } + handle->head = aux_head; handle->size -= size; @@ -433,7 +445,7 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) } int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, - pgoff_t pgoff, int nr_pages, int flags) + pgoff_t pgoff, int nr_pages, long watermark, int flags) { bool overwrite = !(flags & RING_BUFFER_WRITABLE); int node = (event->cpu == -1) ? -1 : cpu_to_node(event->cpu); @@ -497,6 +509,10 @@ int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, atomic_set(&rb->aux_refcount, 1); rb->aux_overwrite = overwrite; + rb->aux_watermark = watermark; + + if (!rb->aux_watermark && !rb->aux_overwrite) + rb->aux_watermark = nr_pages << (PAGE_SHIFT - 1); out: if (!ret) -- cgit v1.3-8-gc7d7 From ec0d7729bbaed4b9d2d3fada693278e13a3d1368 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Wed, 14 Jan 2015 14:18:23 +0200 Subject: perf: Add ITRACE_START record to indicate that tracing has started For counters that generate AUX data that is bound to the context of a running task, such as instruction tracing, the decoder needs to know exactly which task is running when the event is first scheduled in, before the first sched_switch. The decoder's need to know this stems from the fact that instruction flow trace decoding will almost always require program's object code in order to reconstruct said flow and for that we need at least its pid/tid in the perf stream. To single out such instruction tracing pmus, this patch introduces ITRACE PMU capability. The reason this is not part of RECORD_AUX record is that not all pmus capable of generating AUX data need this, and the opposite is *probably* also true. While sched_switch covers for most cases, there are two problems with it: the consumer will need to process events out of order (that is, having found RECORD_AUX, it will have to skip forward to the nearest sched_switch to figure out which task it was, then go back to the actual trace to decode it) and it completely misses the case when the tracing is enabled and disabled before sched_switch, for example, via PERF_EVENT_IOC_DISABLE. Signed-off-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kaixu Xia Cc: Linus Torvalds Cc: Mike Galbraith Cc: Paul Mackerras Cc: Robert Richter Cc: Stephane Eranian Cc: Thomas Gleixner Cc: acme@infradead.org Cc: adrian.hunter@intel.com Cc: kan.liang@intel.com Cc: markus.t.metzger@intel.com Cc: mathieu.poirier@linaro.org Link: http://lkml.kernel.org/r/1421237903-181015-15-git-send-email-alexander.shishkin@linux.intel.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 4 ++++ include/uapi/linux/perf_event.h | 11 +++++++++++ kernel/events/core.c | 41 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 45c5873ad9b3..61992cf2e977 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -129,6 +129,9 @@ struct hw_perf_event { struct list_head cqm_groups_entry; struct list_head cqm_group_entry; }; + struct { /* itrace */ + int itrace_started; + }; #ifdef CONFIG_HAVE_HW_BREAKPOINT struct { /* breakpoint */ /* @@ -177,6 +180,7 @@ struct perf_event; #define PERF_PMU_CAP_AUX_NO_SG 0x04 #define PERF_PMU_CAP_AUX_SW_DOUBLEBUF 0x08 #define PERF_PMU_CAP_EXCLUSIVE 0x10 +#define PERF_PMU_CAP_ITRACE 0x20 /** * struct pmu - generic performance monitoring unit diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 84819546c8ce..309211b3eb67 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -789,6 +789,17 @@ enum perf_event_type { */ PERF_RECORD_AUX = 11, + /* + * Indicates that instruction trace has started + * + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * }; + */ + PERF_RECORD_ITRACE_START = 12, + PERF_RECORD_MAX, /* non-ABI */ }; diff --git a/kernel/events/core.c b/kernel/events/core.c index 31f6b504ad62..06917d537302 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1831,6 +1831,7 @@ static void perf_set_shadow_time(struct perf_event *event, #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); +static void perf_log_itrace_start(struct perf_event *event); static int event_sched_in(struct perf_event *event, @@ -1869,6 +1870,8 @@ event_sched_in(struct perf_event *event, perf_set_shadow_time(event, ctx, tstamp); + perf_log_itrace_start(event); + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; @@ -5991,6 +5994,44 @@ static void perf_log_throttle(struct perf_event *event, int enable) perf_output_end(&handle); } +static void perf_log_itrace_start(struct perf_event *event) +{ + struct perf_output_handle handle; + struct perf_sample_data sample; + struct perf_aux_event { + struct perf_event_header header; + u32 pid; + u32 tid; + } rec; + int ret; + + if (event->parent) + event = event->parent; + + if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) || + event->hw.itrace_started) + return; + + event->hw.itrace_started = 1; + + rec.header.type = PERF_RECORD_ITRACE_START; + rec.header.misc = 0; + rec.header.size = sizeof(rec); + rec.pid = perf_event_pid(event, current); + rec.tid = perf_event_tid(event, current); + + perf_event_header__init_id(&rec.header, &sample, event); + ret = perf_output_begin(&handle, event, rec.header.size); + + if (ret) + return; + + perf_output_put(&handle, rec); + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + /* * Generic event overflow handling, sampling. 
*/ -- cgit v1.3-8-gc7d7 From b3738d29323344da3017a91010530cf3a58590fc Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 17 Nov 2014 20:07:03 +0100 Subject: watchdog: Add watchdog enable/disable all functions This patch adds two new functions to enable/disable the watchdog across all CPUs. This will be used by the HT PMU bug workaround code to disable/enable the NMI watchdog across quirk enablement. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: bp@alien8.de Cc: jolsa@redhat.com Cc: kan.liang@intel.com Cc: maria.n.dimakopoulou@gmail.com Cc: Frederic Weisbecker Cc: Don Zickus Cc: Andrew Morton Link: http://lkml.kernel.org/r/1416251225-17721-12-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- include/linux/watchdog.h | 8 ++++++++ kernel/watchdog.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+) (limited to 'kernel') diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 395b70e0eccf..a746bf5216f8 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -137,4 +137,12 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, extern int watchdog_register_device(struct watchdog_device *); extern void watchdog_unregister_device(struct watchdog_device *); +#ifdef CONFIG_HARDLOCKUP_DETECTOR +void watchdog_nmi_disable_all(void); +void watchdog_nmi_enable_all(void); +#else +static inline void watchdog_nmi_disable_all(void) {} +static inline void watchdog_nmi_enable_all(void) {} +#endif + #endif /* ifndef _LINUX_WATCHDOG_H */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 3174bf8e3538..9a056f5bc02c 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -567,9 +567,37 @@ static void watchdog_nmi_disable(unsigned int cpu) cpu0_err = 0; } } + +void watchdog_nmi_enable_all(void) +{ + int cpu; + + if (!watchdog_user_enabled) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_enable(cpu); + put_online_cpus(); +} + +void watchdog_nmi_disable_all(void) +{ + int cpu; + + if (!watchdog_running) + return; + + get_online_cpus(); + for_each_online_cpu(cpu) + watchdog_nmi_disable(cpu); + put_online_cpus(); +} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } +void watchdog_nmi_enable_all(void) {} +void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { -- cgit v1.3-8-gc7d7 From 07c54f7a7ff77bb47bae26e566969e9c4b6fb0c6 Mon Sep 17 00:00:00 2001 From: Abel Vesa Date: Tue, 3 Mar 2015 13:50:27 +0200 Subject: sched/core: Remove unused argument from init_[rt|dl]_rq() Obviously, 'rq' is not used in these two functions, therefore, there is no reason for it to be passed as an argument. 
Signed-off-by: Abel Vesa Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1425383427-26244-1-git-send-email-abelvesa@gmail.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/deadline.c | 2 +- kernel/sched/rt.c | 4 ++-- kernel/sched/sched.h | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b3b6887c6b1..4c49e75ca24d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7164,8 +7164,8 @@ void __init sched_init(void) rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs); - init_rt_rq(&rq->rt, rq); - init_dl_rq(&rq->dl, rq); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); #ifdef CONFIG_FAIR_GROUP_SCHED root_task_group.shares = ROOT_TASK_GROUP_LOAD; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 24c18dc10fd7..5e2f99bd5ce0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -69,7 +69,7 @@ void init_dl_bw(struct dl_bw *dl_b) dl_b->total_bw = 0; } -void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq) +void init_dl_rq(struct dl_rq *dl_rq) { dl_rq->rb_root = RB_ROOT; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index ad0241561c3e..575da76a3874 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -64,7 +64,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) static void push_irq_work_func(struct irq_work *work); #endif -void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +void init_rt_rq(struct rt_rq *rt_rq) { struct rt_prio_array *array; int i; @@ -205,7 +205,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; - init_rt_rq(rt_rq, cpu_rq(i)); + init_rt_rq(rt_rq); rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 91c6736d2522..e0e129993958 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1671,8 +1671,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); extern void init_cfs_rq(struct cfs_rq *cfs_rq); -extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); -extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq); +extern void init_rt_rq(struct rt_rq *rt_rq); +extern void init_dl_rq(struct dl_rq *dl_rq); extern void cfs_bandwidth_usage_inc(void); extern void cfs_bandwidth_usage_dec(void); -- cgit v1.3-8-gc7d7 From 4cd57f97135840f637431c92380c8da3edbe44ed Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 31 Mar 2015 09:53:36 +0100 Subject: sched/deadline: Always enqueue on previous rq when dl_task_timer() fires dl_task_timer() may fire on a different rq from where a task was removed after throttling. Since the call path is: dl_task_timer() -> enqueue_task_dl() -> enqueue_dl_entity() -> replenish_dl_entity() and replenish_dl_entity() uses dl_se's rq, we can't use current's rq in dl_task_timer(), but we need to lock the task's previous one. 
Tested-by: Wanpeng Li Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Acked-by: Kirill Tkhai Cc: Juri Lelli Fixes: 3960c8c0c789 ("sched: Make dl_task_time() use task_rq_lock()") Link: http://lkml.kernel.org/r/1427792017-7356-1-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 5e2f99bd5ce0..9d3ad6433d88 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -514,7 +514,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) unsigned long flags; struct rq *rq; - rq = task_rq_lock(current, &flags); + rq = task_rq_lock(p, &flags); /* * We need to take care of several possible races here: @@ -569,7 +569,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) push_dl_task(rq); #endif unlock: - task_rq_unlock(rq, current, &flags); + task_rq_unlock(rq, p, &flags); return HRTIMER_NORESTART; } -- cgit v1.3-8-gc7d7 From 3c18d447b3b36a8d3c90dc37dfbd363cdb685d0a Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Tue, 31 Mar 2015 09:53:37 +0100 Subject: sched/core: Check for available DL bandwidth in cpuset_cpu_inactive() Hotplug operations are destructive w.r.t. cpusets. In case such an operation is performed on a CPU belonging to an exlusive cpuset, the DL bandwidth information associated with the corresponding root domain is gone even if the operation fails (in sched_cpu_inactive()). For this reason we need to move the check we currently have in sched_cpu_inactive() to cpuset_cpu_inactive() to prevent useless cpusets reconfiguration in the CPU_DOWN_FAILED path. Signed-off-by: Juri Lelli Signed-off-by: Peter Zijlstra (Intel) Cc: Juri Lelli Link: http://lkml.kernel.org/r/1427792017-7356-2-git-send-email-juri.lelli@arm.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 56 ++++++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4c49e75ca24d..28b0d75a8273 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5337,36 +5337,13 @@ static int sched_cpu_active(struct notifier_block *nfb, static int sched_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned long flags; - long cpu = (long)hcpu; - struct dl_bw *dl_b; - switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - set_cpu_active(cpu, false); - - /* explicitly allow suspend */ - if (!(action & CPU_TASKS_FROZEN)) { - bool overflow; - int cpus; - - rcu_read_lock_sched(); - dl_b = dl_bw_of(cpu); - - raw_spin_lock_irqsave(&dl_b->lock, flags); - cpus = dl_bw_cpus(cpu); - overflow = __dl_overflow(dl_b, cpus, 0, 0); - raw_spin_unlock_irqrestore(&dl_b->lock, flags); - - rcu_read_unlock_sched(); - - if (overflow) - return notifier_from_errno(-EBUSY); - } + set_cpu_active((long)hcpu, false); return NOTIFY_OK; + default: + return NOTIFY_DONE; } - - return NOTIFY_DONE; } static int __init migration_init(void) @@ -7006,7 +6983,6 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, */ case CPU_ONLINE: - case CPU_DOWN_FAILED: cpuset_update_active_cpus(true); break; default: @@ -7018,8 +6994,32 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, void *hcpu) { - switch (action) { + unsigned long flags; + long 
cpu = (long)hcpu; + struct dl_bw *dl_b; + + switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: + /* explicitly allow suspend */ + if (!(action & CPU_TASKS_FROZEN)) { + bool overflow; + int cpus; + + rcu_read_lock_sched(); + dl_b = dl_bw_of(cpu); + + raw_spin_lock_irqsave(&dl_b->lock, flags); + cpus = dl_bw_cpus(cpu); + overflow = __dl_overflow(dl_b, cpus, 0, 0); + raw_spin_unlock_irqrestore(&dl_b->lock, flags); + + rcu_read_unlock_sched(); + + if (overflow) { + trace_printk("hotplug failed for cpu %lu", cpu); + return notifier_from_errno(-EBUSY); + } + } cpuset_update_active_cpus(false); break; case CPU_DOWN_PREPARE_FROZEN: -- cgit v1.3-8-gc7d7 From fa9c9d10e97e38d9903fad1829535175ad261f45 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 27 Mar 2015 07:08:35 +0800 Subject: sched/deadline: Support DL task migration during CPU hotplug I observed that DL tasks can't be migrated to other CPUs during CPU hotplug, in addition, task may/may not be running again if CPU is added back. The root cause which I found is that DL tasks will be throtted and removed from the DL rq after comsuming all their budget, which leads to the situation that stop task can't pick them up from the DL rq and migrate them to other CPUs during hotplug. The method to reproduce: schedtool -E -t 50000:100000 -e ./test Actually './test' is just a simple for loop. Then observe which CPU the test task is on and offline it: echo 0 > /sys/devices/system/cpu/cpuN/online This patch adds the DL task migration during CPU hotplug by finding a most suitable later deadline rq after DL timer fires if current rq is offline. If it fails to find a suitable later deadline rq then it falls back to any eligible online CPU in so that the deadline task will come back to us, and the push/pull mechanism should then move it around properly. Suggested-and-Acked-by: Juri Lelli Signed-off-by: Wanpeng Li Signed-off-by: Peter Zijlstra (Intel) Link: http://lkml.kernel.org/r/1427411315-4298-1-git-send-email-wanpeng.li@linux.intel.com Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) (limited to 'kernel') diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9d3ad6433d88..5e95145088fd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -218,6 +218,52 @@ static inline void set_post_schedule(struct rq *rq) rq->post_schedule = has_pushable_dl_tasks(rq); } +static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq); + +static void dl_task_offline_migration(struct rq *rq, struct task_struct *p) +{ + struct rq *later_rq = NULL; + bool fallback = false; + + later_rq = find_lock_later_rq(p, rq); + + if (!later_rq) { + int cpu; + + /* + * If we cannot preempt any rq, fall back to pick any + * online cpu. + */ + fallback = true; + cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); + if (cpu >= nr_cpu_ids) { + /* + * Fail to find any suitable cpu. + * The task will never come back! + */ + BUG_ON(dl_bandwidth_enabled()); + + /* + * If admission control is disabled we + * try a little harder to let the task + * run. 
+ */ + cpu = cpumask_any(cpu_active_mask); + } + later_rq = cpu_rq(cpu); + double_lock_balance(rq, later_rq); + } + + deactivate_task(rq, p, 0); + set_task_cpu(p, later_rq->cpu); + activate_task(later_rq, p, ENQUEUE_REPLENISH); + + if (!fallback) + resched_curr(later_rq); + + double_unlock_balance(rq, later_rq); +} + #else static inline @@ -536,6 +582,17 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) sched_clock_tick(); update_rq_clock(rq); +#ifdef CONFIG_SMP + /* + * If we find that the rq the task was on is no longer + * available, we need to select a new rq. + */ + if (unlikely(!rq->online)) { + dl_task_offline_migration(rq, p); + goto unlock; + } +#endif + /* * If the throttle happened during sched-out; like: * -- cgit v1.3-8-gc7d7 From b337a9380f7effd60d082569dd7e0b97a7549730 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Mar 2015 20:49:00 +0530 Subject: timer: Allocate per-cpu tvec_base's statically Memory for the 'tvec_base' array is allocated separately for the boot CPU (statically) and non-boot CPUs (dynamically). The reason is because __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've made NULL special, hint: lock_timer_base()) and we cannot get a compile time pointer to per-cpu entries because we don't know where we'll map the section, even for the boot cpu. This can be simplified a bit by statically allocating per-cpu memory. The only disadvantage is that memory for one of the structures will stay unused, i.e. for the boot CPU, which uses boot_tvec_bases. This will also guarantee that tvec_base is cacheline aligned. Even though tvec_base has ____cacheline_aligned stuck on, kzalloc_node() does not actually respect that (but guarantees a minimum u64 alignment). Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/17cdf560f2727f687ab159707d0aa591f8a2f82d.1427814611.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 2d3f5c504939..f3cc653f876c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -90,8 +90,19 @@ struct tvec_base { struct tvec tv5; } ____cacheline_aligned; +/* + * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've + * made NULL special, hint: lock_timer_base()) and we cannot get a compile time + * pointer to per-cpu entries because we don't know where we'll map the section, + * even for the boot cpu. + * + * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the + * rest of them. 
+ */ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); +static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); + static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; /* Functions below help us manage 'deferrable' flag */ @@ -1534,46 +1545,25 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible); static int init_timers_cpu(int cpu) { - int j; - struct tvec_base *base; + struct tvec_base *base = per_cpu(tvec_bases, cpu); static char tvec_base_done[NR_CPUS]; + int j; if (!tvec_base_done[cpu]) { - static char boot_done; + static char boot_cpu_skipped; - if (boot_done) { - /* - * The APs use this path later in boot - */ - base = kzalloc_node(sizeof(*base), GFP_KERNEL, - cpu_to_node(cpu)); - if (!base) - return -ENOMEM; - - /* Make sure tvec_base has TIMER_FLAG_MASK bits free */ - if (WARN_ON(base != tbase_get_base(base))) { - kfree(base); - return -ENOMEM; - } - per_cpu(tvec_bases, cpu) = base; + if (!boot_cpu_skipped) { + boot_cpu_skipped = 1; /* skip the boot cpu */ } else { - /* - * This is for the boot CPU - we use compile-time - * static initialisation because per-cpu memory isn't - * ready yet and because the memory allocators are not - * initialised either. - */ - boot_done = 1; - base = &boot_tvec_bases; + base = per_cpu_ptr(&__tvec_bases, cpu); + per_cpu(tvec_bases, cpu) = base; } + spin_lock_init(&base->lock); tvec_base_done[cpu] = 1; base->cpu = cpu; - } else { - base = per_cpu(tvec_bases, cpu); } - for (j = 0; j < TVN_SIZE; j++) { INIT_LIST_HEAD(base->tv5.vec + j); INIT_LIST_HEAD(base->tv4.vec + j); -- cgit v1.3-8-gc7d7 From 8def906044c02edcedac79aa3d6310ab4d90c4d8 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Tue, 31 Mar 2015 20:49:01 +0530 Subject: timer: Don't initialize 'tvec_base' on hotplug There is no need to call init_timers_cpu() on every CPU hotplug event, there is not much we need to reset. - Timer-lists are already empty at the end of migrate_timers(). - timer_jiffies will be refreshed while adding a new timer, after the CPU is online again. - active_timers and all_timers can be reset from migrate_timers(). 
Signed-off-by: Viresh Kumar Signed-off-by: Peter Zijlstra (Intel) Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/54a1c30ea7b805af55beb220cadf5a07a21b0a4d.1427814611.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 98 +++++++++++++++++++++++------------------------------ 1 file changed, 43 insertions(+), 55 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index f3cc653f876c..1feb9c7035c0 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1543,43 +1543,6 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); -static int init_timers_cpu(int cpu) -{ - struct tvec_base *base = per_cpu(tvec_bases, cpu); - static char tvec_base_done[NR_CPUS]; - int j; - - if (!tvec_base_done[cpu]) { - static char boot_cpu_skipped; - - if (!boot_cpu_skipped) { - boot_cpu_skipped = 1; /* skip the boot cpu */ - } else { - base = per_cpu_ptr(&__tvec_bases, cpu); - per_cpu(tvec_bases, cpu) = base; - } - - spin_lock_init(&base->lock); - tvec_base_done[cpu] = 1; - base->cpu = cpu; - } - - for (j = 0; j < TVN_SIZE; j++) { - INIT_LIST_HEAD(base->tv5.vec + j); - INIT_LIST_HEAD(base->tv4.vec + j); - INIT_LIST_HEAD(base->tv3.vec + j); - INIT_LIST_HEAD(base->tv2.vec + j); - } - for (j = 0; j < TVR_SIZE; j++) - INIT_LIST_HEAD(base->tv1.vec + j); - - base->timer_jiffies = jiffies; - base->next_timer = base->timer_jiffies; - base->active_timers = 0; - base->all_timers = 0; - return 0; -} - #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) { @@ -1621,6 +1584,9 @@ static void migrate_timers(int cpu) migrate_timer_list(new_base, old_base->tv5.vec + i); } + old_base->active_timers = 0; + old_base->all_timers = 0; + spin_unlock(&old_base->lock); spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); @@ -1630,25 +1596,16 @@ static void migrate_timers(int cpu) static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { - long cpu = (long)hcpu; - int err; - - switch(action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = init_timers_cpu(cpu); - if (err < 0) - return notifier_from_errno(err); - break; #ifdef CONFIG_HOTPLUG_CPU + switch (action) { case CPU_DEAD: case CPU_DEAD_FROZEN: - migrate_timers(cpu); + migrate_timers((long)hcpu); break; -#endif default: break; } +#endif return NOTIFY_OK; } @@ -1656,18 +1613,49 @@ static struct notifier_block timers_nb = { .notifier_call = timer_cpu_notify, }; +static void __init init_timer_cpu(struct tvec_base *base, int cpu) +{ + int j; -void __init init_timers(void) + base->cpu = cpu; + per_cpu(tvec_bases, cpu) = base; + spin_lock_init(&base->lock); + + for (j = 0; j < TVN_SIZE; j++) { + INIT_LIST_HEAD(base->tv5.vec + j); + INIT_LIST_HEAD(base->tv4.vec + j); + INIT_LIST_HEAD(base->tv3.vec + j); + INIT_LIST_HEAD(base->tv2.vec + j); + } + for (j = 0; j < TVR_SIZE; j++) + INIT_LIST_HEAD(base->tv1.vec + j); + + base->timer_jiffies = jiffies; + base->next_timer = base->timer_jiffies; +} + +static void __init init_timer_cpus(void) { - int err; + struct tvec_base *base; + int local_cpu = smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == local_cpu) + base = &boot_tvec_bases; + else + base = per_cpu_ptr(&__tvec_bases, cpu); + + init_timer_cpu(base, cpu); + } +} +void __init init_timers(void) +{ /* ensure there are enough low bits for flags in timer->base pointer */ BUILD_BUG_ON(__alignof__(struct tvec_base) & 
TIMER_FLAG_MASK); - err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - BUG_ON(err != NOTIFY_OK); - + init_timer_cpus(); init_timer_stats(); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); -- cgit v1.3-8-gc7d7 From 3650b57fdf208bc0e36cbe7b5e0744bd0e0cf34d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 31 Mar 2015 20:49:02 +0530 Subject: timer: Further simplify the SMP and HOTPLUG logic Remove one CONFIG_HOTPLUG_CPU #ifdef in trade for introducing one CONFIG_SMP #ifdef. The CONFIG_SMP ifdef avoids declaring the per-CPU __tvec_bases storage on UP systems since they already have boot_tvec_bases. Also (re)add a runtime check on the base alignment -- for the paranoid amongst us :-) Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fdd2d35e169bdc554ffa3fe77f77716298c75ada.1427814611.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timer.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1feb9c7035c0..2ece3aa5069c 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -101,7 +101,6 @@ struct tvec_base { */ struct tvec_base boot_tvec_bases; EXPORT_SYMBOL(boot_tvec_bases); -static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; @@ -1038,6 +1037,8 @@ int try_to_del_timer_sync(struct timer_list *timer) EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct tvec_base, __tvec_bases); + /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated @@ -1591,12 +1592,10 @@ static void migrate_timers(int cpu) spin_unlock_irq(&new_base->lock); put_cpu_var(tvec_bases); } -#endif /* CONFIG_HOTPLUG_CPU */ static int timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { -#ifdef CONFIG_HOTPLUG_CPU switch (action) { case CPU_DEAD: case CPU_DEAD_FROZEN: @@ -1605,18 +1604,24 @@ static int timer_cpu_notify(struct notifier_block *self, default: break; } -#endif + return NOTIFY_OK; } -static struct notifier_block timers_nb = { - .notifier_call = timer_cpu_notify, -}; +static inline void timer_register_cpu_notifier(void) +{ + cpu_notifier(timer_cpu_notify, 0); +} +#else +static inline void timer_register_cpu_notifier(void) { } +#endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(struct tvec_base *base, int cpu) { int j; + BUG_ON(base != tbase_get_base(base)); + base->cpu = cpu; per_cpu(tvec_bases, cpu) = base; spin_lock_init(&base->lock); @@ -1643,8 +1648,10 @@ static void __init init_timer_cpus(void) for_each_possible_cpu(cpu) { if (cpu == local_cpu) base = &boot_tvec_bases; +#ifdef CONFIG_SMP else base = per_cpu_ptr(&__tvec_bases, cpu); +#endif init_timer_cpu(base, cpu); } @@ -1657,7 +1664,7 @@ void __init init_timers(void) init_timer_cpus(); init_timer_stats(); - register_cpu_notifier(&timers_nb); + timer_register_cpu_notifier(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } -- cgit v1.3-8-gc7d7 From 00ccbf2f5b7580cd7dcdaeda84828d14f0cba3c9 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Thu, 19 Feb 2015 15:56:14 +0100 Subject: ftrace/x86: Let dynamic trampolines call ops->func even for dynamic fops Dynamically allocated trampolines call ftrace_ops_get_func to get the function which they should call. 
For dynamic fops (FTRACE_OPS_FL_DYNAMIC flag is set) ftrace_ops_list_func is always returned. This is reasonable for static trampolines but goes against the main advantage of dynamic ones, that is avoidance of going through the list of all registered callbacks for functions that are only being traced by a single callback. We can fix it by returning ops->func (or recursion safe version) from ftrace_ops_get_func whenever it is possible for dynamic trampolines. Note that dynamic trampolines are not allowed for dynamic fops if CONFIG_PREEMPT=y. Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1501291023000.25445@pobox.suse.cz Link: http://lkml.kernel.org/r/1424357773-13536-1-git-send-email-mbenes@suse.cz Reported-by: Miroslav Benes Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4f228024055b..d01d238d8ef4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -249,6 +249,19 @@ static void update_function_graph_func(void); static inline void update_function_graph_func(void) { } #endif + +static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) +{ + /* + * If this is a dynamic ops or we force list func, + * then it needs to call the list anyway. + */ + if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + return ftrace_ops_list_func; + + return ftrace_ops_get_func(ops); +} + static void update_ftrace_function(void) { ftrace_func_t func; @@ -270,7 +283,7 @@ static void update_ftrace_function(void) * then have the mcount trampoline call the function directly. */ } else if (ftrace_ops_list->next == &ftrace_list_end) { - func = ftrace_ops_get_func(ftrace_ops_list); + func = ftrace_ops_get_list_func(ftrace_ops_list); } else { /* Just use the default ftrace_ops */ @@ -5208,13 +5221,6 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, */ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { - /* - * If this is a dynamic ops or we force list func, - * then it needs to call the list anyway. - */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) - return ftrace_ops_list_func; - /* * If the func handles its own recursion, call it directly. * Otherwise call the recursion protected function that -- cgit v1.3-8-gc7d7 From 9a806ddbb9a18c510e4acdcc828b9a87f5fd3aef Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 1 Apr 2015 20:34:21 -0700 Subject: time: Add y2038 safe read_boot_clock64() As part of addressing in-kernel y2038 issues, this patch adds read_boot_clock64() and replaces all the call sites of read_boot_clock() with this function. This is a __weak implementation, which simply calls the existing y2038 unsafe read_boot_clock(). This allows architecture specific implementations to be converted independently, and eventually the y2038 unsafe read_boot_clock() can be removed after all its architecture specific implementations have been converted to read_boot_clock64(). 
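For context, a minimal sketch of why the 64-bit interfaces are needed; the layouts below are simplified for illustration and are not the exact kernel definitions (the point is that the legacy tv_sec is a 32-bit long on 32-bit architectures):

struct timespec {			/* legacy */
	long		tv_sec;		/* wraps after 2038-01-19 03:14:07 UTC on 32-bit */
	long		tv_nsec;
};

struct timespec64 {			/* y2038-safe */
	time64_t	tv_sec;		/* always 64 bits wide */
	long		tv_nsec;
};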
Suggested-by: Arnd Bergmann Signed-off-by: Xunlei Pang Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1427945681-29972-2-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 11 +++++++++-- 2 files changed, 10 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 5047b83483d6..18d27a3f72ca 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -269,6 +269,7 @@ static inline bool has_persistent_clock(void) extern void read_persistent_clock(struct timespec *ts); extern void read_boot_clock(struct timespec *ts); +extern void read_boot_clock64(struct timespec64 *ts); extern int update_persistent_clock(struct timespec now); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b12292b343a..652e50a9c6ed 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1188,6 +1188,14 @@ void __weak read_boot_clock(struct timespec *ts) ts->tv_nsec = 0; } +void __weak read_boot_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_boot_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ @@ -1209,8 +1217,7 @@ void __init timekeeping_init(void) } else if (now.tv_sec || now.tv_nsec) persistent_clock_exist = true; - read_boot_clock(&ts); - boot = timespec_to_timespec64(ts); + read_boot_clock64(&boot); if (!timespec64_valid_strict(&boot)) { pr_warn("WARNING: Boot clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); -- cgit v1.3-8-gc7d7 From 2ee966320028ac846654eba5344540eeb4dc228d Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 1 Apr 2015 20:34:22 -0700 Subject: time: Add y2038 safe read_persistent_clock64() As part of addressing in-kernel y2038 issues, this patch adds read_persistent_clock64() and replaces all the call sites of read_persistent_clock() with this function. This is a __weak implementation, which simply calls the existing y2038 unsafe read_persistent_clock(). This allows architecture specific implementations to be converted independently, and eventually the y2038 unsafe read_persistent_clock() can be removed after all its architecture specific implementations have been converted to read_persistent_clock64(). 
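As an illustration of the conversion target, here is a minimal sketch of what an architecture-specific strong definition might look like once a platform stops relying on the __weak fallback; my_plat_rtc_read_seconds64() is a made-up helper used only for this example:

#include <linux/timekeeping.h>

void read_persistent_clock64(struct timespec64 *ts)
{
	/* Hypothetical platform helper returning the full 64-bit second count */
	ts->tv_sec = my_plat_rtc_read_seconds64();
	ts->tv_nsec = 0;
}

Once every architecture provides such a definition, the timespec-based read_persistent_clock() and its __weak wrapper can be dropped.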
Suggested-by: Arnd Bergmann Signed-off-by: Xunlei Pang Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1427945681-29972-3-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- arch/mips/lasat/sysctl.c | 4 ++-- include/linux/timekeeping.h | 1 + kernel/time/timekeeping.c | 22 ++++++++++++---------- 3 files changed, 15 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 3b7f65cc4218..cf9b4633257e 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -75,11 +75,11 @@ static int rtctmp; int proc_dolasatrtc(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - struct timespec ts; + struct timespec64 ts; int r; if (!write) { - read_persistent_clock(&ts); + read_persistent_clock64(&ts); rtctmp = ts.tv_sec; /* check for time < 0 and set to 0 */ if (rtctmp < 0) diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 18d27a3f72ca..4c0f76f4616c 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -268,6 +268,7 @@ static inline bool has_persistent_clock(void) } extern void read_persistent_clock(struct timespec *ts); +extern void read_persistent_clock64(struct timespec64 *ts); extern void read_boot_clock(struct timespec *ts); extern void read_boot_clock64(struct timespec64 *ts); extern int update_persistent_clock(struct timespec now); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 652e50a9c6ed..b1dbfa573dce 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1173,6 +1173,14 @@ void __weak read_persistent_clock(struct timespec *ts) ts->tv_nsec = 0; } +void __weak read_persistent_clock64(struct timespec64 *ts64) +{ + struct timespec ts; + + read_persistent_clock(&ts); + *ts64 = timespec_to_timespec64(ts); +} + /** * read_boot_clock - Return time of the system start. * @@ -1205,10 +1213,8 @@ void __init timekeeping_init(void) struct clocksource *clock; unsigned long flags; struct timespec64 now, boot, tmp; - struct timespec ts; - read_persistent_clock(&ts); - now = timespec_to_timespec64(ts); + read_persistent_clock64(&now); if (!timespec64_valid_strict(&now)) { pr_warn("WARNING: Persistent clock returned invalid value!\n" " Check your CMOS/BIOS settings.\n"); @@ -1278,7 +1284,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value * - * This hook is for architectures that cannot support read_persistent_clock + * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. 
* * This function should only be called by rtc_resume(), and allows @@ -1325,12 +1331,10 @@ void timekeeping_resume(void) struct clocksource *clock = tk->tkr_mono.clock; unsigned long flags; struct timespec64 ts_new, ts_delta; - struct timespec tmp; cycle_t cycle_now, cycle_delta; bool suspendtime_found = false; - read_persistent_clock(&tmp); - ts_new = timespec_to_timespec64(tmp); + read_persistent_clock64(&ts_new); clockevents_resume(); clocksource_resume(); @@ -1406,10 +1410,8 @@ int timekeeping_suspend(void) unsigned long flags; struct timespec64 delta, delta_delta; static struct timespec64 old_delta; - struct timespec tmp; - read_persistent_clock(&tmp); - timekeeping_suspend_time = timespec_to_timespec64(tmp); + read_persistent_clock64(&timekeeping_suspend_time); /* * On some systems the persistent_clock can not be detected at -- cgit v1.3-8-gc7d7 From 3c00a1fe8496ff29ab62764bb3f4ce4b48089004 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 1 Apr 2015 20:34:23 -0700 Subject: time: Add y2038 safe update_persistent_clock64() As part of addressing in-kernel y2038 issues, this patch adds update_persistent_clock64() and replaces all the call sites of update_persistent_clock() with this function. This is a __weak implementation, which simply calls the existing y2038 unsafe update_persistent_clock(). This allows architecture specific implementations to be converted independently, and eventually y2038-unsafe update_persistent_clock() can be removed after all its architecture specific implementations have been converted to update_persistent_clock64(). Suggested-by: Arnd Bergmann Signed-off-by: Xunlei Pang Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1427945681-29972-4-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- drivers/rtc/systohc.c | 2 +- include/linux/timekeeping.h | 1 + kernel/time/ntp.c | 13 ++++++++++++- 3 files changed, 14 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c index eb71872d0361..ef3c07a52c3a 100644 --- a/drivers/rtc/systohc.c +++ b/drivers/rtc/systohc.c @@ -11,7 +11,7 @@ * rtc_set_ntp_time - Save NTP synchronized time to the RTC * @now: Current time of day * - * Replacement for the NTP platform function update_persistent_clock + * Replacement for the NTP platform function update_persistent_clock64 * that stores time for later retrieval by rtc_hctosys. 
* * Returns 0 on successful RTC update, -ENODEV if a RTC update is not diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 4c0f76f4616c..7a2369d5b3f4 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -272,6 +272,7 @@ extern void read_persistent_clock64(struct timespec64 *ts); extern void read_boot_clock(struct timespec *ts); extern void read_boot_clock64(struct timespec64 *ts); extern int update_persistent_clock(struct timespec now); +extern int update_persistent_clock64(struct timespec64 now); #endif diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 9ad60d028508..7a681003001c 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -458,6 +458,16 @@ out: return leap; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock64(struct timespec64 now64) +{ + struct timespec now; + + now = timespec64_to_timespec(now64); + return update_persistent_clock(now); +} +#endif + #if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_cmos_clock(struct work_struct *work); @@ -493,8 +503,9 @@ static void sync_cmos_clock(struct work_struct *work) if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); #ifdef CONFIG_GENERIC_CMOS_UPDATE - fail = update_persistent_clock(timespec64_to_timespec(adjust)); + fail = update_persistent_clock64(adjust); #endif + #ifdef CONFIG_RTC_SYSTOHC if (fail == -ENODEV) fail = rtc_set_ntp_time(adjust); -- cgit v1.3-8-gc7d7 From 7f2981393af31a854879f2496cab4c978e886902 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 1 Apr 2015 20:34:35 -0700 Subject: time: Don't build timekeeping_inject_sleeptime64() if no one uses it timekeeping_inject_sleeptime64() is only used by RTC suspend/resume, so add build dependencies on the necessary RTC related macros. Signed-off-by: Xunlei Pang [ Improve commit message clarity. ] Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1427945681-29972-16-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b1dbfa573dce..3be559b6fd0a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1280,6 +1280,7 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, tk_debug_account_sleep_time(delta); } +#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value @@ -1317,6 +1318,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) /* signal hrtimers about time change */ clock_was_set(); } +#endif /** * timekeeping_resume - Resumes the generic timekeeping subsystem. -- cgit v1.3-8-gc7d7 From 264bb3f79f2a465477cdcd2f0554e21aedc443a3 Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 1 Apr 2015 20:34:37 -0700 Subject: time: Fix a bug in timekeeping_suspend() with no persistent clock When there's no persistent clock, normally timekeeping_suspend_time should always be zero, but this can break in timekeeping_suspend(). At T1, there was a system suspend, so old_delta was assigned T1. 
After some time, one time adjustment happened, and xtime got the value of T1-dt(0s Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1427945681-29972-18-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3be559b6fd0a..b7db4916415b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1255,7 +1255,7 @@ void __init timekeeping_init(void) raw_spin_unlock_irqrestore(&timekeeper_lock, flags); } -/* time in seconds when suspend began */ +/* time in seconds when suspend began for persistent clock */ static struct timespec64 timekeeping_suspend_time; /** @@ -1428,24 +1428,26 @@ int timekeeping_suspend(void) timekeeping_forward_now(tk); timekeeping_suspended = 1; - /* - * To avoid drift caused by repeated suspend/resumes, - * which each can add ~1 second drift error, - * try to compensate so the difference in system time - * and persistent_clock time stays close to constant. - */ - delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); - delta_delta = timespec64_sub(delta, old_delta); - if (abs(delta_delta.tv_sec) >= 2) { + if (has_persistent_clock()) { /* - * if delta_delta is too large, assume time correction - * has occured and set old_delta to the current delta. + * To avoid drift caused by repeated suspend/resumes, + * which each can add ~1 second drift error, + * try to compensate so the difference in system time + * and persistent_clock time stays close to constant. */ - old_delta = delta; - } else { - /* Otherwise try to adjust old_system to compensate */ - timekeeping_suspend_time = - timespec64_add(timekeeping_suspend_time, delta_delta); + delta = timespec64_sub(tk_xtime(tk), timekeeping_suspend_time); + delta_delta = timespec64_sub(delta, old_delta); + if (abs(delta_delta.tv_sec) >= 2) { + /* + * if delta_delta is too large, assume time correction + * has occurred and set old_delta to the current delta. + */ + old_delta = delta; + } else { + /* Otherwise try to adjust old_system to compensate */ + timekeeping_suspend_time = + timespec64_add(timekeeping_suspend_time, delta_delta); + } } timekeeping_update(tk, TK_MIRROR); -- cgit v1.3-8-gc7d7 From 0fa88cb4b82b5cf7429bc1cef9db006ca035754e Mon Sep 17 00:00:00 2001 From: Xunlei Pang Date: Wed, 1 Apr 2015 20:34:38 -0700 Subject: time, drivers/rtc: Don't bother with rtc_resume() for the nonstop clocksource If a system does not provide a persistent_clock(), the time will be updated on resume by rtc_resume(). With the addition of the non-stop clocksources for suspend timing, those systems set the time on resume in timekeeping_resume(), but may not provide a valid persistent_clock(). This results in the rtc_resume() logic thinking no one has set the time and it then will over-write the suspend time again, which is not necessary and only increases clock error. So, fix this for rtc_resume(). This patch also improves the name of persistent_clock_exist to make it more grammatical. 
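[ Illustrative sketch, not part of the patch: the resume-time decision this change establishes, written as a free-standing helper. It relies on the timekeeping_rtc_skipresume() interface added in the diff below; the wrapper name rtc_resume_should_set_time() is made up for the example. ]

static bool rtc_resume_should_set_time(void)
{
	/*
	 * timekeeping_resume() injects the suspend interval itself when it
	 * could use a nonstop clocksource or a persistent clock.  In that
	 * case the RTC path must not inject it a second time, so
	 * rtc_resume() bails out early.
	 */
	return !timekeeping_rtc_skipresume();
}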
Signed-off-by: Xunlei Pang Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1427945681-29972-19-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- drivers/rtc/class.c | 4 +-- include/linux/timekeeping.h | 9 +++---- kernel/time/timekeeping.c | 66 +++++++++++++++++++++++++++++++++------------ 3 files changed, 54 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index d40760aa4553..c29ba7e14304 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -55,7 +55,7 @@ static int rtc_suspend(struct device *dev) struct timespec64 delta, delta_delta; int err; - if (has_persistent_clock()) + if (timekeeping_rtc_skipsuspend()) return 0; if (strcmp(dev_name(&rtc->dev), CONFIG_RTC_HCTOSYS_DEVICE) != 0) @@ -102,7 +102,7 @@ static int rtc_resume(struct device *dev) struct timespec64 sleep_time; int err; - if (has_persistent_clock()) + if (timekeeping_rtc_skipresume()) return 0; rtc_hctosys_ret = -ENODEV; diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index 7a2369d5b3f4..99176af216af 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -248,6 +248,9 @@ static inline void timekeeping_clocktai(struct timespec *ts) /* * RTC specific */ +extern bool timekeeping_rtc_skipsuspend(void); +extern bool timekeeping_rtc_skipresume(void); + extern void timekeeping_inject_sleeptime64(struct timespec64 *delta); /* @@ -259,14 +262,8 @@ extern void getnstime_raw_and_real(struct timespec *ts_raw, /* * Persistent clock related interfaces */ -extern bool persistent_clock_exist; extern int persistent_clock_is_local; -static inline bool has_persistent_clock(void) -{ - return persistent_clock_exist; -} - extern void read_persistent_clock(struct timespec *ts); extern void read_persistent_clock64(struct timespec64 *ts); extern void read_boot_clock(struct timespec *ts); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index b7db4916415b..79b9bc6e7876 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -64,9 +64,6 @@ static struct tk_fast tk_fast_raw ____cacheline_aligned; /* flag for if timekeeping is suspended */ int __read_mostly timekeeping_suspended; -/* Flag for if there is a persistent clock on this platform */ -bool __read_mostly persistent_clock_exist = false; - static inline void tk_normalize_xtime(struct timekeeper *tk) { while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) { @@ -1204,6 +1201,12 @@ void __weak read_boot_clock64(struct timespec64 *ts64) *ts64 = timespec_to_timespec64(ts); } +/* Flag for if timekeeping_resume() has injected sleeptime */ +static bool sleeptime_injected; + +/* Flag for if there is a persistent clock on this platform */ +static bool persistent_clock_exists; + /* * timekeeping_init - Initializes the clocksource and common timekeeping values */ @@ -1221,7 +1224,7 @@ void __init timekeeping_init(void) now.tv_sec = 0; now.tv_nsec = 0; } else if (now.tv_sec || now.tv_nsec) - persistent_clock_exist = true; + persistent_clock_exists = true; read_boot_clock64(&boot); if (!timespec64_valid_strict(&boot)) { @@ -1281,12 +1284,48 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, } #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_RTC_HCTOSYS_DEVICE) +/** + * We have three kinds of time sources to use for sleep time + * injection, the preference order is: + * 1) non-stop clocksource + * 2) persistent clock (ie: RTC accessible when irqs are off) + * 3) 
RTC + * + * 1) and 2) are used by timekeeping, 3) by RTC subsystem. + * If system has neither 1) nor 2), 3) will be used finally. + * + * + * If timekeeping has injected sleeptime via either 1) or 2), + * 3) becomes needless, so in this case we don't need to call + * rtc_resume(), and this is what timekeeping_rtc_skipresume() + * means. + */ +bool timekeeping_rtc_skipresume(void) +{ + return sleeptime_injected; +} + +/** + * 1) can be determined whether to use or not only when doing + * timekeeping_resume() which is invoked after rtc_suspend(), + * so we can't skip rtc_suspend() surely if system has 1). + * + * But if system has 2), 2) will definitely be used, so in this + * case we don't need to call rtc_suspend(), and this is what + * timekeeping_rtc_skipsuspend() means. + */ +bool timekeeping_rtc_skipsuspend(void) +{ + return persistent_clock_exists; +} + /** * timekeeping_inject_sleeptime64 - Adds suspend interval to timeekeeping values * @delta: pointer to a timespec64 delta value * * This hook is for architectures that cannot support read_persistent_clock64 * because their RTC/persistent clock is only accessible when irqs are enabled. + * and also don't have an effective nonstop clocksource. * * This function should only be called by rtc_resume(), and allows * a suspend offset to be injected into the timekeeping values. @@ -1296,13 +1335,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) struct timekeeper *tk = &tk_core.timekeeper; unsigned long flags; - /* - * Make sure we don't set the clock twice, as timekeeping_resume() - * already did it - */ - if (has_persistent_clock()) - return; - raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); @@ -1334,8 +1366,8 @@ void timekeeping_resume(void) unsigned long flags; struct timespec64 ts_new, ts_delta; cycle_t cycle_now, cycle_delta; - bool suspendtime_found = false; + sleeptime_injected = false; read_persistent_clock64(&ts_new); clockevents_resume(); @@ -1381,13 +1413,13 @@ void timekeeping_resume(void) nsec += ((u64) cycle_delta * mult) >> shift; ts_delta = ns_to_timespec64(nsec); - suspendtime_found = true; + sleeptime_injected = true; } else if (timespec64_compare(&ts_new, &timekeeping_suspend_time) > 0) { ts_delta = timespec64_sub(ts_new, timekeeping_suspend_time); - suspendtime_found = true; + sleeptime_injected = true; } - if (suspendtime_found) + if (sleeptime_injected) __timekeeping_inject_sleeptime(tk, &ts_delta); /* Re-base the last cycle value */ @@ -1421,14 +1453,14 @@ int timekeeping_suspend(void) * value returned, update the persistent_clock_exists flag. */ if (timekeeping_suspend_time.tv_sec || timekeeping_suspend_time.tv_nsec) - persistent_clock_exist = true; + persistent_clock_exists = true; raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&tk_core.seq); timekeeping_forward_now(tk); timekeeping_suspended = 1; - if (has_persistent_clock()) { + if (persistent_clock_exists) { /* * To avoid drift caused by repeated suspend/resumes, * which each can add ~1 second drift error, -- cgit v1.3-8-gc7d7 From 8e56f33f8439b2f8e7f4ae7f3d0bfe683ecc3b09 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 1 Apr 2015 20:34:39 -0700 Subject: clocksource: Improve comment explaining clocks_calc_max_nsecs()'s 50% safety margin Ingo noted that the description of clocks_calc_max_nsecs()'s 50% safety margin was somewhat circular. So this patch tries to improve the comment to better explain what we mean by the 50% safety margin and why we need it. 
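[ Illustrative numbers, not part of the patch: a userspace back-of-the-envelope calculation of the 50% margin, assuming a 32-bit counter where one cycle equals one nanosecond (mult = 1, shift = 0). The real clocks_calc_max_nsecs() also accounts for maxadj; this only shows why roughly half the wrap interval is reported. ]

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t mask = 0xffffffffULL;	/* 32-bit free-running counter */
	uint32_t mult = 1, shift = 0;	/* 1 cycle == 1 ns, for simplicity */

	/* Nanoseconds until the counter wraps: roughly 4.29 seconds. */
	uint64_t wrap_ns = (mask * mult) >> shift;

	/*
	 * Report only half of that (about 2.15 seconds), so a timer that is
	 * delayed by up to the same amount can still be detected instead of
	 * silently overflowing the conversion math or wrapping the counter
	 * unnoticed.
	 */
	uint64_t max_ns = wrap_ns >> 1;

	printf("wrap after %llu ns, reported max %llu ns\n",
	       (unsigned long long)wrap_ns, (unsigned long long)max_ns);
	return 0;
}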
Signed-off-by: John Stultz Cc: Peter Zijlstra Cc: Prarit Bhargava Cc: Richard Cochran Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1427945681-29972-20-git-send-email-john.stultz@linaro.org Signed-off-by: Ingo Molnar --- kernel/time/clocksource.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c3be3c71bbad..15facb1b9c60 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -472,8 +472,11 @@ static u32 clocksource_max_adjustment(struct clocksource *cs) * @max_cyc: maximum cycle value before potential overflow (does not include * any safety margin) * - * NOTE: This function includes a safety margin of 50%, so that bad clock values - * can be detected. + * NOTE: This function includes a safety margin of 50%, in other words, we + * return half the number of nanoseconds the hardware counter can technically + * cover. This is done so that we can potentially detect problems caused by + * delayed timers or bad hardware, which might result in time intervals that + * are larger then what the math used can handle without overflows. */ u64 clocks_calc_max_nsecs(u32 mult, u32 shift, u32 maxadj, u64 mask, u64 *max_cyc) { -- cgit v1.3-8-gc7d7 From 592a438ff3fea61d303c5784c209b3f1fd3e16df Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2015 02:01:10 +0200 Subject: clockevents: Provide explicit broadcast control functions clockevents_notify() is a leftover from the early design of the clockevents facility. It's really not a notification mechanism, it's a multiplex call. We are way better off to have explicit calls instead of this monstrosity. Split out the broadcast control into a separate function and provide inline helpers. Switch clockevents_notify() over. This will go away once all callers are converted. This also gets rid of the nested locking of clockevents_lock and broadcast_lock. The broadcast control functions do not require clockevents_lock. Only the managing functions (setup/shutdown/suspend/resume of the broadcast device require clockevents_lock. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki Cc: Daniel Lezcano Cc: Len Brown Cc: Peter Zijlstra Cc: Tony Lindgren Link: http://lkml.kernel.org/r/8086559.ttsuS0n1Xr@vostro.rjw.lan Signed-off-by: Ingo Molnar --- include/linux/tick.h | 25 ++++++++++++++++++ kernel/time/clockevents.c | 6 ++++- kernel/time/tick-broadcast.c | 62 +++++++++++++++++++------------------------- kernel/time/tick-internal.h | 2 -- 4 files changed, 57 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index f9ff225d53c0..4bb236357b03 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -42,6 +42,31 @@ extern void hotplug_cpu__broadcast_tick_pull(int dead_cpu); static inline void hotplug_cpu__broadcast_tick_pull(int dead_cpu) { } #endif +enum tick_broadcast_mode { + TICK_BROADCAST_OFF, + TICK_BROADCAST_ON, + TICK_BROADCAST_FORCE, +}; + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST +extern void tick_broadcast_control(enum tick_broadcast_mode mode); +#else +static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { } +#endif /* BROADCAST */ + +static inline void tick_broadcast_enable(void) +{ + tick_broadcast_control(TICK_BROADCAST_ON); +} +static inline void tick_broadcast_disable(void) +{ + tick_broadcast_control(TICK_BROADCAST_OFF); +} +static inline void tick_broadcast_force(void) +{ + tick_broadcast_control(TICK_BROADCAST_FORCE); +} + #ifdef CONFIG_NO_HZ_COMMON extern int tick_nohz_tick_stopped(void); extern void tick_nohz_idle_enter(void); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7af614829da1..599ff8d3fda5 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -656,9 +656,13 @@ int clockevents_notify(unsigned long reason, void *arg) switch (reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: + tick_broadcast_enable(); + break; case CLOCK_EVT_NOTIFY_BROADCAST_OFF: + tick_broadcast_disable(); + break; case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_on_off(reason, arg); + tick_broadcast_force(); break; case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f5e0fd5652dc..1a0bee04ef8c 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -33,7 +33,7 @@ static cpumask_var_t tick_broadcast_mask; static cpumask_var_t tick_broadcast_on; static cpumask_var_t tmpmask; static DEFINE_RAW_SPINLOCK(tick_broadcast_lock); -static int tick_broadcast_force; +static int tick_broadcast_forced; #ifdef CONFIG_TICK_ONESHOT static void tick_broadcast_clear_oneshot(int cpu); @@ -326,49 +326,54 @@ unlock: raw_spin_unlock(&tick_broadcast_lock); } -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop +/** + * tick_broadcast_control - Enable/disable or force broadcast mode + * @mode: The selected broadcast mode + * + * Called when the system enters a state where affected tick devices + * might stop. Note: TICK_BROADCAST_FORCE cannot be undone. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. 
*/ -static void tick_do_broadcast_on_off(unsigned long *reason) +void tick_broadcast_control(enum tick_broadcast_mode mode) { struct clock_event_device *bc, *dev; struct tick_device *td; - unsigned long flags; int cpu, bc_stopped; - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); + td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; - bc = tick_broadcast_device.evtdev; /* * Is the device not affected by the powerstate ? */ if (!dev || !(dev->features & CLOCK_EVT_FEAT_C3STOP)) - goto out; + return; if (!tick_device_is_functional(dev)) - goto out; + return; + raw_spin_lock(&tick_broadcast_lock); + cpu = smp_processor_id(); + bc = tick_broadcast_device.evtdev; bc_stopped = cpumask_empty(tick_broadcast_mask); - switch (*reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: + switch (mode) { + case TICK_BROADCAST_FORCE: + tick_broadcast_forced = 1; + case TICK_BROADCAST_ON: cpumask_set_cpu(cpu, tick_broadcast_on); if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_mask)) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) clockevents_shutdown(dev); } - if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) - tick_broadcast_force = 1; break; - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - if (tick_broadcast_force) + + case TICK_BROADCAST_OFF: + if (tick_broadcast_forced) break; cpumask_clear_cpu(cpu, tick_broadcast_on); if (!tick_device_is_functional(dev)) @@ -390,22 +395,9 @@ static void tick_do_broadcast_on_off(unsigned long *reason) else tick_broadcast_setup_oneshot(bc); } -out: - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop. - */ -void tick_broadcast_on_off(unsigned long reason, int *oncpu) -{ - if (!cpumask_test_cpu(*oncpu, cpu_online_mask)) - printk(KERN_ERR "tick-broadcast: ignoring broadcast for " - "offline CPU #%d\n", *oncpu); - else - tick_do_broadcast_on_off(&reason); + raw_spin_unlock(&tick_broadcast_lock); } +EXPORT_SYMBOL_GPL(tick_broadcast_control); /* * Set the periodic handler depending on broadcast on/off diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index b6ba0a44e740..62e331d1bc76 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -53,7 +53,6 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_broadcast_on_off(unsigned long reason, int *oncpu); extern void tick_shutdown_broadcast(unsigned int *cpup); extern void tick_suspend_broadcast(void); extern void tick_resume_broadcast(void); @@ -68,7 +67,6 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev) static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { } static inline void tick_shutdown_broadcast(unsigned int *cpup) { } static inline void tick_suspend_broadcast(void) { } static inline void tick_resume_broadcast(void) { } -- cgit v1.3-8-gc7d7 From 
89feddbfe7023ccfb4a6d7f5e3f5161d91b28b18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2015 02:03:42 +0200 Subject: clockevents: Remove the broadcast control leftovers All users converted. Remove the notify leftovers. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/2076318.76XJZ8QYP3@vostro.rjw.lan Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 3 --- kernel/time/clockevents.c | 10 ---------- 2 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index caeca76d7b32..438fafa75067 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -11,9 +11,6 @@ /* Clock event notification values */ enum clock_event_nofitiers { CLOCK_EVT_NOTIFY_ADD, - CLOCK_EVT_NOTIFY_BROADCAST_ON, - CLOCK_EVT_NOTIFY_BROADCAST_OFF, - CLOCK_EVT_NOTIFY_BROADCAST_FORCE, CLOCK_EVT_NOTIFY_BROADCAST_ENTER, CLOCK_EVT_NOTIFY_BROADCAST_EXIT, CLOCK_EVT_NOTIFY_CPU_DYING, diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 599ff8d3fda5..dba0b83708b3 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -655,16 +655,6 @@ int clockevents_notify(unsigned long reason, void *arg) raw_spin_lock_irqsave(&clockevents_lock, flags); switch (reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ON: - tick_broadcast_enable(); - break; - case CLOCK_EVT_NOTIFY_BROADCAST_OFF: - tick_broadcast_disable(); - break; - case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: - tick_broadcast_force(); - break; - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: ret = tick_broadcast_oneshot_control(reason); -- cgit v1.3-8-gc7d7 From 1fe5d5c3c9ba0c4ade18e3325cba0ffe35127941 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2015 02:05:15 +0200 Subject: clockevents: Provide explicit broadcast oneshot control functions clockevents_notify() is a leftover from the early design of the clockevents facility. It's really not a notification mechanism, it's a multiplex call. We are way better off to have explicit calls instead of this monstrosity. Split out the broadcast oneshot control into a separate function and provide inline helpers. Switch clockevents_notify() over. This will go away once all callers are converted. This also gets rid of the nested locking of clockevents_lock and broadcast_lock. The broadcast oneshot control functions do not require clockevents_lock. Only the managing functions (setup/shutdown/suspend/resume of the broadcast device require clockevents_lock. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki Cc: Alexandre Courbot Cc: Daniel Lezcano Cc: Len Brown Cc: Peter Zijlstra Cc: Stephen Warren Cc: Thierry Reding Cc: Tony Lindgren Link: http://lkml.kernel.org/r/13000649.8qZuEDV0OA@vostro.rjw.lan Signed-off-by: Ingo Molnar --- include/linux/tick.h | 19 +++++++++++++++++++ kernel/time/clockevents.c | 4 +++- kernel/time/tick-broadcast.c | 28 +++++++++++++++++----------- kernel/time/tick-internal.h | 2 -- 4 files changed, 39 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/include/linux/tick.h b/include/linux/tick.h index 4bb236357b03..6119321fc3be 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -48,12 +48,23 @@ enum tick_broadcast_mode { TICK_BROADCAST_FORCE, }; +enum tick_broadcast_state { + TICK_BROADCAST_EXIT, + TICK_BROADCAST_ENTER, +}; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST extern void tick_broadcast_control(enum tick_broadcast_mode mode); #else static inline void tick_broadcast_control(enum tick_broadcast_mode mode) { } #endif /* BROADCAST */ +#if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) +extern int tick_broadcast_oneshot_control(enum tick_broadcast_state state); +#else +static inline int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { return 0; } +#endif + static inline void tick_broadcast_enable(void) { tick_broadcast_control(TICK_BROADCAST_ON); @@ -66,6 +77,14 @@ static inline void tick_broadcast_force(void) { tick_broadcast_control(TICK_BROADCAST_FORCE); } +static inline int tick_broadcast_enter(void) +{ + return tick_broadcast_oneshot_control(TICK_BROADCAST_ENTER); +} +static inline void tick_broadcast_exit(void) +{ + tick_broadcast_oneshot_control(TICK_BROADCAST_EXIT); +} #ifdef CONFIG_NO_HZ_COMMON extern int tick_nohz_tick_stopped(void); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index dba0b83708b3..7791b1c94ef2 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -656,8 +656,10 @@ int clockevents_notify(unsigned long reason, void *arg) switch (reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: + tick_broadcast_enter(); + break; case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - ret = tick_broadcast_oneshot_control(reason); + tick_broadcast_exit(); break; case CLOCK_EVT_NOTIFY_CPU_DYING: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 1a0bee04ef8c..55e43f20987a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -687,18 +687,23 @@ void hotplug_cpu__broadcast_tick_pull(int deadcpu) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -/* - * Powerstate information: The system enters/leaves a state, where - * affected devices might stop +/** + * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode + * @state: The target state (enter/exit) + * + * The system enters/leaves a state, where affected devices might stop * Returns 0 on success, -EBUSY if the cpu is used to broadcast wakeups. + * + * Called with interrupts disabled, so clockevents_lock is not + * required here because the local clock event device cannot go away + * under us. 
*/ -int tick_broadcast_oneshot_control(unsigned long reason) +int tick_broadcast_oneshot_control(enum tick_broadcast_state state) { struct clock_event_device *bc, *dev; struct tick_device *td; - unsigned long flags; - ktime_t now; int cpu, ret = 0; + ktime_t now; /* * Periodic mode does not care about the enter/exit of power @@ -711,17 +716,17 @@ int tick_broadcast_oneshot_control(unsigned long reason) * We are called with preemtion disabled from the depth of the * idle code, so we can't be moved away. */ - cpu = smp_processor_id(); - td = &per_cpu(tick_cpu_device, cpu); + td = this_cpu_ptr(&tick_cpu_device); dev = td->evtdev; if (!(dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; + raw_spin_lock(&tick_broadcast_lock); bc = tick_broadcast_device.evtdev; + cpu = smp_processor_id(); - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) { + if (state == TICK_BROADCAST_ENTER) { if (!cpumask_test_and_set_cpu(cpu, tick_broadcast_oneshot_mask)) { WARN_ON_ONCE(cpumask_test_cpu(cpu, tick_broadcast_pending_mask)); broadcast_shutdown_local(bc, dev); @@ -813,9 +818,10 @@ int tick_broadcast_oneshot_control(unsigned long reason) } } out: - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); + raw_spin_unlock(&tick_broadcast_lock); return ret; } +EXPORT_SYMBOL_GPL(tick_broadcast_oneshot_control); /* * Reset the one shot broadcast for a cpu diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 62e331d1bc76..0266f9dbd114 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -117,7 +117,6 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } /* Functions related to oneshot broadcasting */ #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); -extern int tick_broadcast_oneshot_control(unsigned long reason); extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_broadcast_oneshot_active(void); @@ -126,7 +125,6 @@ bool tick_broadcast_oneshot_available(void); extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } -static inline int tick_broadcast_oneshot_control(unsigned long reason) { return 0; } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } -- cgit v1.3-8-gc7d7 From 335f49196fd6011521f078cb44f445847e5aa183 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2015 02:34:49 +0200 Subject: sched/idle: Use explicit broadcast oneshot control function Replace the clockevents_notify() call with an explicit function call. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. 
Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/6422336.RMm7oUHcXh@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/sched/idle.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 80014a178342..4d207d2abcbd 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -158,8 +158,7 @@ static void cpuidle_idle_call(void) * is used from another cpu as a broadcast timer, this call may * fail if it is not available */ - if (broadcast && - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) + if (broadcast && tick_broadcast_enter()) goto use_default; /* Take note of the planned idle state. */ @@ -176,7 +175,7 @@ static void cpuidle_idle_call(void) idle_set_state(this_rq(), NULL); if (broadcast) - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); + tick_broadcast_exit(); /* * Give the governor an opportunity to reflect on the outcome -- cgit v1.3-8-gc7d7 From ffa48c0d76803057ee89bf220305466d74256d7b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Apr 2015 02:36:10 +0200 Subject: clockevents: Remove broadcast oneshot control leftovers Now that all users are converted over to explicit calls into the clockevents state machine, remove the notification chain leftovers. Original-from: Thomas Gleixner Signed-off-by: Rafael J. Wysocki Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: John Stultz Link: http://lkml.kernel.org/r/14018863.NQUzkFuafr@vostro.rjw.lan Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 2 -- kernel/time/clockevents.c | 7 ------- 2 files changed, 9 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 438fafa75067..f97f498a5d10 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -11,8 +11,6 @@ /* Clock event notification values */ enum clock_event_nofitiers { CLOCK_EVT_NOTIFY_ADD, - CLOCK_EVT_NOTIFY_BROADCAST_ENTER, - CLOCK_EVT_NOTIFY_BROADCAST_EXIT, CLOCK_EVT_NOTIFY_CPU_DYING, CLOCK_EVT_NOTIFY_CPU_DEAD, }; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 7791b1c94ef2..be9abf32c0b9 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -655,13 +655,6 @@ int clockevents_notify(unsigned long reason, void *arg) raw_spin_lock_irqsave(&clockevents_lock, flags); switch (reason) { - case CLOCK_EVT_NOTIFY_BROADCAST_ENTER: - tick_broadcast_enter(); - break; - case CLOCK_EVT_NOTIFY_BROADCAST_EXIT: - tick_broadcast_exit(); - break; - case CLOCK_EVT_NOTIFY_CPU_DYING: tick_handover_do_timer(arg); break; -- cgit v1.3-8-gc7d7 From 52c063d1adbc16c76e70fffa20727fcd4e9343b3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2015 02:37:24 +0200 Subject: clockevents: Make tick handover explicit clockevents_notify() is a leftover from the early design of the clockevents facility. It's really not a notification mechanism, it's a multiplex call. We are way better off to have explicit calls instead of this monstrosity. Split out the tick_handover call and invoke it explicitely from the hotplug code. Temporary solution will be cleaned up in later patches. Signed-off-by: Thomas Gleixner [ Rebase ] Signed-off-by: Rafael J. 
Wysocki Cc: Peter Zijlstra Cc: John Stultz Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1658173.RkEEILFiQZ@vostro.rjw.lan Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 1 - include/linux/tick.h | 2 ++ kernel/cpu.c | 2 ++ kernel/time/clockevents.c | 4 ---- kernel/time/hrtimer.c | 4 ---- kernel/time/tick-common.c | 9 ++++++--- kernel/time/tick-internal.h | 1 - 7 files changed, 10 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index f97f498a5d10..f4bde22f8a05 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -11,7 +11,6 @@ /* Clock event notification values */ enum clock_event_nofitiers { CLOCK_EVT_NOTIFY_ADD, - CLOCK_EVT_NOTIFY_CPU_DYING, CLOCK_EVT_NOTIFY_CPU_DEAD, }; diff --git a/include/linux/tick.h b/include/linux/tick.h index 6119321fc3be..2c68fa3b9436 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -19,12 +19,14 @@ extern void tick_unfreeze(void); extern void tick_suspend_local(void); /* Should be core only, but XEN resume magic and ARM BL switcher require it */ extern void tick_resume_local(void); +extern void tick_handover_do_timer(void); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_freeze(void) { } static inline void tick_unfreeze(void) { } static inline void tick_suspend_local(void) { } static inline void tick_resume_local(void) { } +static inline void tick_handover_do_timer(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #ifdef CONFIG_TICK_ONESHOT diff --git a/kernel/cpu.c b/kernel/cpu.c index af5db20e5803..eba7eaa1341d 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -339,6 +339,8 @@ static int __ref take_cpu_down(void *_param) return err; cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Give up timekeeping duties */ + tick_handover_do_timer(); /* Park the stopper thread */ kthread_park(current); return 0; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index be9abf32c0b9..88fb3b96c7cc 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -655,10 +655,6 @@ int clockevents_notify(unsigned long reason, void *arg) raw_spin_lock_irqsave(&clockevents_lock, flags); switch (reason) { - case CLOCK_EVT_NOTIFY_CPU_DYING: - tick_handover_do_timer(arg); - break; - case CLOCK_EVT_NOTIFY_CPU_DEAD: tick_shutdown_broadcast_oneshot(arg); tick_shutdown_broadcast(arg); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 721d29b99d10..6a7a64ec7d1b 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1707,10 +1707,6 @@ static int hrtimer_cpu_notify(struct notifier_block *self, break; #ifdef CONFIG_HOTPLUG_CPU - case CPU_DYING: - case CPU_DYING_FROZEN: - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu); - break; case CPU_DEAD: case CPU_DEAD_FROZEN: { diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index e28ba5c044c5..055c868f3ec9 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -332,20 +332,23 @@ out_bc: tick_install_broadcast_device(newdev); } +#ifdef CONFIG_HOTPLUG_CPU /* * Transfer the do_timer job away from a dying cpu. * - * Called with interrupts disabled. + * Called with interrupts disabled. Not locking required. If + * tick_do_timer_cpu is owned by this cpu, nothing can change it. 
*/ -void tick_handover_do_timer(int *cpup) +void tick_handover_do_timer(void) { - if (*cpup == tick_do_timer_cpu) { + if (tick_do_timer_cpu == smp_processor_id()) { int cpu = cpumask_first(cpu_online_mask); tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu : TICK_DO_TIMER_NONE; } } +#endif /* * Shutdown an event device on a given cpu: diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 0266f9dbd114..aabcb5d00cf2 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -20,7 +20,6 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); -extern void tick_handover_do_timer(int *cpup); extern void tick_shutdown(unsigned int *cpup); extern void tick_suspend(void); extern void tick_resume(void); -- cgit v1.3-8-gc7d7 From a49b116dcb1265f238f3169507424257b0519069 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2015 02:38:05 +0200 Subject: clockevents: Cleanup dead cpu explicitely clockevents_notify() is a leftover from the early design of the clockevents facility. It's really not a notification mechanism, it's a multiplex call. We are way better off to have explicit calls instead of this monstrosity. Split out the cleanup function for a dead cpu and invoke it directly from the cpu down code. Make it conditional on CPU_HOTPLUG as well. Temporary change, will be refined in the future. Signed-off-by: Thomas Gleixner [ Rebased, added clockevents_notify() removal ] Signed-off-by: Rafael J. Wysocki Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1735025.raBZdQHM3m@vostro.rjw.lan Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 6 ------ include/linux/tick.h | 2 ++ kernel/cpu.c | 1 + kernel/time/clockevents.c | 51 ++++++++++++++++++-------------------------- kernel/time/hrtimer.c | 3 --- kernel/time/tick-broadcast.c | 39 ++++++++++++++++----------------- kernel/time/tick-common.c | 6 +++--- kernel/time/tick-internal.h | 10 ++++----- 8 files changed, 52 insertions(+), 66 deletions(-) (limited to 'kernel') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index f4bde22f8a05..96c280b2c263 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -8,12 +8,6 @@ #ifndef _LINUX_CLOCKCHIPS_H #define _LINUX_CLOCKCHIPS_H -/* Clock event notification values */ -enum clock_event_nofitiers { - CLOCK_EVT_NOTIFY_ADD, - CLOCK_EVT_NOTIFY_CPU_DEAD, -}; - #ifdef CONFIG_GENERIC_CLOCKEVENTS # include diff --git a/include/linux/tick.h b/include/linux/tick.h index 2c68fa3b9436..f8492da57ad3 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -20,6 +20,7 @@ extern void tick_suspend_local(void); /* Should be core only, but XEN resume magic and ARM BL switcher require it */ extern void tick_resume_local(void); extern void tick_handover_do_timer(void); +extern void tick_cleanup_dead_cpu(int cpu); #else /* CONFIG_GENERIC_CLOCKEVENTS */ static inline void tick_init(void) { } static inline void tick_freeze(void) { } @@ -27,6 +28,7 @@ static inline void tick_unfreeze(void) { } static inline void tick_suspend_local(void) { } static inline void tick_resume_local(void) { } static inline void tick_handover_do_timer(void) { } +static inline void tick_cleanup_dead_cpu(int cpu) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #ifdef CONFIG_TICK_ONESHOT diff --git a/kernel/cpu.c b/kernel/cpu.c index eba7eaa1341d..82eea9c5af61 
100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -419,6 +419,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __cpu_die(cpu); /* CPU is completely dead: tell everyone. Too late to complain. */ + tick_cleanup_dead_cpu(cpu); cpu_notify_nofail(CPU_DEAD | mod, hcpu); check_for_tasks(cpu); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 88fb3b96c7cc..25d942d1da27 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -642,49 +642,40 @@ void clockevents_resume(void) dev->resume(dev); } +#ifdef CONFIG_HOTPLUG_CPU /** - * clockevents_notify - notification about relevant events - * Returns 0 on success, any other value on error + * tick_cleanup_dead_cpu - Cleanup the tick and clockevents of a dead cpu */ -int clockevents_notify(unsigned long reason, void *arg) +void tick_cleanup_dead_cpu(int cpu) { struct clock_event_device *dev, *tmp; unsigned long flags; - int cpu, ret = 0; raw_spin_lock_irqsave(&clockevents_lock, flags); - switch (reason) { - case CLOCK_EVT_NOTIFY_CPU_DEAD: - tick_shutdown_broadcast_oneshot(arg); - tick_shutdown_broadcast(arg); - tick_shutdown(arg); - /* - * Unregister the clock event devices which were - * released from the users in the notify chain. - */ - list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + tick_shutdown_broadcast_oneshot(cpu); + tick_shutdown_broadcast(cpu); + tick_shutdown(cpu); + /* + * Unregister the clock event devices which were + * released from the users in the notify chain. + */ + list_for_each_entry_safe(dev, tmp, &clockevents_released, list) + list_del(&dev->list); + /* + * Now check whether the CPU has left unused per cpu devices + */ + list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { + if (cpumask_test_cpu(cpu, dev->cpumask) && + cpumask_weight(dev->cpumask) == 1 && + !tick_is_broadcast_device(dev)) { + BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); list_del(&dev->list); - /* - * Now check whether the CPU has left unused per cpu devices - */ - cpu = *((int *)arg); - list_for_each_entry_safe(dev, tmp, &clockevent_devices, list) { - if (cpumask_test_cpu(cpu, dev->cpumask) && - cpumask_weight(dev->cpumask) == 1 && - !tick_is_broadcast_device(dev)) { - BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED); - list_del(&dev->list); - } } - break; - default: - break; } raw_spin_unlock_irqrestore(&clockevents_lock, flags); - return ret; } -EXPORT_SYMBOL_GPL(clockevents_notify); +#endif #ifdef CONFIG_SYSFS struct bus_type clockevents_subsys = { diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 6a7a64ec7d1b..76d4bd962b19 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1709,11 +1709,8 @@ static int hrtimer_cpu_notify(struct notifier_block *self, #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: case CPU_DEAD_FROZEN: - { - clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DEAD, &scpu); migrate_hrtimers(scpu); break; - } #endif default: diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 55e43f20987a..7e8ca4f448a8 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -410,14 +410,14 @@ void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast) dev->event_handler = tick_handle_periodic_broadcast; } +#ifdef CONFIG_HOTPLUG_CPU /* * Remove a CPU from broadcasting */ -void tick_shutdown_broadcast(unsigned int *cpup) +void tick_shutdown_broadcast(unsigned int cpu) { struct clock_event_device *bc; unsigned long flags; - unsigned int cpu = *cpup; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); 
@@ -432,6 +432,7 @@ void tick_shutdown_broadcast(unsigned int *cpup) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#endif void tick_suspend_broadcast(void) { @@ -672,21 +673,6 @@ static void broadcast_shutdown_local(struct clock_event_device *bc, clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN); } -void hotplug_cpu__broadcast_tick_pull(int deadcpu) -{ - struct clock_event_device *bc; - unsigned long flags; - - raw_spin_lock_irqsave(&tick_broadcast_lock, flags); - bc = tick_broadcast_device.evtdev; - - if (bc && broadcast_needs_cpu(bc, deadcpu)) { - /* This moves the broadcast assignment to this CPU: */ - clockevents_program_event(bc, bc->next_event, 1); - } - raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); -} - /** * tick_broadcast_oneshot_control - Enter/exit broadcast oneshot mode * @state: The target state (enter/exit) @@ -908,14 +894,28 @@ void tick_broadcast_switch_to_oneshot(void) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#ifdef CONFIG_HOTPLUG_CPU +void hotplug_cpu__broadcast_tick_pull(int deadcpu) +{ + struct clock_event_device *bc; + unsigned long flags; + + raw_spin_lock_irqsave(&tick_broadcast_lock, flags); + bc = tick_broadcast_device.evtdev; + + if (bc && broadcast_needs_cpu(bc, deadcpu)) { + /* This moves the broadcast assignment to this CPU: */ + clockevents_program_event(bc, bc->next_event, 1); + } + raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); +} /* * Remove a dead CPU from broadcasting */ -void tick_shutdown_broadcast_oneshot(unsigned int *cpup) +void tick_shutdown_broadcast_oneshot(unsigned int cpu) { unsigned long flags; - unsigned int cpu = *cpup; raw_spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -929,6 +929,7 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } +#endif /* * Check, whether the broadcast device is in one shot mode diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 055c868f3ec9..fac3e98fec49 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -348,7 +348,6 @@ void tick_handover_do_timer(void) TICK_DO_TIMER_NONE; } } -#endif /* * Shutdown an event device on a given cpu: @@ -357,9 +356,9 @@ void tick_handover_do_timer(void) * access the hardware device itself. * We just set the mode and remove it from the lists. 
*/ -void tick_shutdown(unsigned int *cpup) +void tick_shutdown(unsigned int cpu) { - struct tick_device *td = &per_cpu(tick_cpu_device, *cpup); + struct tick_device *td = &per_cpu(tick_cpu_device, cpu); struct clock_event_device *dev = td->evtdev; td->mode = TICKDEV_MODE_PERIODIC; @@ -375,6 +374,7 @@ void tick_shutdown(unsigned int *cpup) td->evtdev = NULL; } } +#endif /** * tick_suspend_local - Suspend the local tick device diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index aabcb5d00cf2..b64fdd8054c5 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -20,7 +20,7 @@ extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_check_new_device(struct clock_event_device *dev); -extern void tick_shutdown(unsigned int *cpup); +extern void tick_shutdown(unsigned int cpu); extern void tick_suspend(void); extern void tick_resume(void); extern bool tick_check_replacement(struct clock_event_device *curdev, @@ -52,7 +52,7 @@ extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt); extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); extern void tick_install_broadcast_device(struct clock_event_device *dev); extern int tick_is_broadcast_device(struct clock_event_device *dev); -extern void tick_shutdown_broadcast(unsigned int *cpup); +extern void tick_shutdown_broadcast(unsigned int cpu); extern void tick_suspend_broadcast(void); extern void tick_resume_broadcast(void); extern bool tick_resume_check_broadcast(void); @@ -66,7 +66,7 @@ static inline void tick_install_broadcast_device(struct clock_event_device *dev) static inline int tick_is_broadcast_device(struct clock_event_device *dev) { return 0; } static inline int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) { return 0; } static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { } -static inline void tick_shutdown_broadcast(unsigned int *cpup) { } +static inline void tick_shutdown_broadcast(unsigned int cpu) { } static inline void tick_suspend_broadcast(void) { } static inline void tick_resume_broadcast(void) { } static inline bool tick_resume_check_broadcast(void) { return false; } @@ -117,7 +117,7 @@ static inline int tick_check_oneshot_change(int allow_nohz) { return 0; } #if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_broadcast_setup_oneshot(struct clock_event_device *bc); extern void tick_broadcast_switch_to_oneshot(void); -extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); +extern void tick_shutdown_broadcast_oneshot(unsigned int cpu); extern int tick_broadcast_oneshot_active(void); extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); @@ -125,7 +125,7 @@ extern struct cpumask *tick_get_broadcast_oneshot_mask(void); #else /* !(BROADCAST && ONESHOT): */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { BUG(); } static inline void tick_broadcast_switch_to_oneshot(void) { } -static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } +static inline void tick_shutdown_broadcast_oneshot(unsigned int cpu) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return 
tick_oneshot_possible(); } -- cgit v1.3-8-gc7d7 From 347c6f6dda1098318088feb8e60188f0161e743d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 3 Apr 2015 02:39:05 +0200 Subject: timekeeping: Get rid of stale comment Arch specific management of xtime/jiffies/wall_to_monotonic is gone for quite a while. Zap the stale comment. Signed-off-by: Thomas Gleixner Signed-off-by: Rafael J. Wysocki Acked-by: John Stultz Cc: Peter Zijlstra Cc: John Stultz Link: http://lkml.kernel.org/r/2422730.dmO29q661S@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/time/timekeeping.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 79b9bc6e7876..946acb72179f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1354,10 +1354,6 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta) /** * timekeeping_resume - Resumes the generic timekeeping subsystem. - * - * This is for the generic clocksource timekeeping. - * xtime/wall_to_monotonic/jiffies/etc are - * still managed by arch specific suspend/resume code. */ void timekeeping_resume(void) { -- cgit v1.3-8-gc7d7 From 62a935b256f68a71697716595347209fb5275426 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Apr 2015 10:42:50 +0200 Subject: sched/core: Drop debugging leftover trace_printk call Commit: 3c18d447b3b3 ("sched/core: Check for available DL bandwidth in cpuset_cpu_inactive()") forgot a trace_printk() debugging piece in and Steve's banner screamed in dmesg. Remove it. Signed-off-by: Borislav Petkov Cc: Juri Lelli Cc: Juri Lelli Cc: Peter Zijlstra (Intel) Cc: Steven Rostedt Link: http://lkml.kernel.org/r/1428050570-21041-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 28b0d75a8273..8027cfd699d0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7015,10 +7015,8 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action, rcu_read_unlock_sched(); - if (overflow) { - trace_printk("hotplug failed for cpu %lu", cpu); + if (overflow) return notifier_from_errno(-EBUSY); - } } cpuset_update_active_cpus(false); break; -- cgit v1.3-8-gc7d7 From 422fe7502e3f16dc1c680f22d31f59f022edc10d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Apr 2015 15:21:51 +0200 Subject: timers/PM: Fix up tick_unfreeze() A recent conflict resolution has left tick_resume() in tick_unfreeze() which leads to an unbalanced execution of tick_resume_broadcast() every time that function runs. Fix that by replacing the tick_resume() in tick_unfreeze() with tick_resume_local() as appropriate. Signed-off-by: Rafael J. 
Wysocki Cc: boris.ostrovsky@oracle.com Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: peterz@infradead.org Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8099075.V0LvN3pQAV@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/time/tick-common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index fac3e98fec49..ad66a51ca4fa 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -482,7 +482,7 @@ void tick_unfreeze(void) if (tick_freeze_depth == num_online_cpus()) timekeeping_resume(); else - tick_resume(); + tick_resume_local(); tick_freeze_depth--; -- cgit v1.3-8-gc7d7 From def747087e83aa5f6a71582cfa71e18341988688 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 3 Apr 2015 15:31:32 +0200 Subject: timers/PM: Drop unnecessary braces from tick_freeze() Some braces in tick_freeze() are not necessary, so drop them. Signed-off-by: Rafael J. Wysocki Cc: peterz@infradead.org Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1534128.H5hN3KBFB4@vostro.rjw.lan Signed-off-by: Ingo Molnar --- kernel/time/tick-common.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index ad66a51ca4fa..3ae6afa1eb98 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -457,11 +457,10 @@ void tick_freeze(void) raw_spin_lock(&tick_freeze_lock); tick_freeze_depth++; - if (tick_freeze_depth == num_online_cpus()) { + if (tick_freeze_depth == num_online_cpus()) timekeeping_suspend(); - } else { + else tick_suspend_local(); - } raw_spin_unlock(&tick_freeze_lock); } -- cgit v1.3-8-gc7d7 From 6ba94429c8e7b87b0fff13c5ac90731b239b77fa Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 2 Apr 2015 19:14:39 +0800 Subject: workqueue: Reorder sysfs code The sysfs code usually belongs to the botom of the file since it deals with high level objects. In the workqueue code it's misplaced and such that we'll need to work around functions references to allow the sysfs code to call APIs like apply_workqueue_attrs(). Lets move that block further in the file, almost the botom. And declare workqueue_sysfs_unregister() just before destroy_workqueue() which reference it. tj: Moved workqueue_sysfs_unregister() forward declaration where other forward declarations are. Suggested-by: Tejun Heo Cc: Christoph Lameter Cc: Kevin Hilman Cc: Lai Jiangshan Cc: Mike Galbraith Cc: Paul E. 
McKenney Cc: Tejun Heo Cc: Viresh Kumar Signed-off-by: Frederic Weisbecker Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 1627 ++++++++++++++++++++++++++-------------------------- 1 file changed, 814 insertions(+), 813 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1ca0b1d54e70..586ad91300b0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -332,6 +332,7 @@ EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); static int worker_thread(void *__worker); static void copy_workqueue_attrs(struct workqueue_attrs *to, const struct workqueue_attrs *from); +static void workqueue_sysfs_unregister(struct workqueue_struct *wq); #define CREATE_TRACE_POINTS #include @@ -3001,792 +3002,475 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -#ifdef CONFIG_SYSFS -/* - * Workqueues with WQ_SYSFS flag set is visible to userland via - * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the - * following attributes. - * - * per_cpu RO bool : whether the workqueue is per-cpu or unbound - * max_active RW int : maximum number of in-flight work items - * - * Unbound workqueues have the following extra attributes. +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free * - * id RO int : the associated pool ID - * nice RW int : nice value of the workers - * cpumask RW mask : bitmask of allowed CPUs for the workers + * Undo alloc_workqueue_attrs(). */ -struct wq_device { - struct workqueue_struct *wq; - struct device dev; -}; - -static struct workqueue_struct *dev_to_wq(struct device *dev) +void free_workqueue_attrs(struct workqueue_attrs *attrs) { - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - - return wq_dev->wq; + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } } -static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, - char *buf) +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. + * + * Return: The allocated new workqueue_attr on success. %NULL on failure. + */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) { - struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; - return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_copy(attrs->cpumask, cpu_possible_mask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; } -static DEVICE_ATTR_RO(per_cpu); -static ssize_t max_active_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from) { - struct workqueue_struct *wq = dev_to_wq(dev); - - return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); + to->nice = from->nice; + cpumask_copy(to->cpumask, from->cpumask); + /* + * Unlike hash and equality test, this function doesn't ignore + * ->no_numa as it is used for both pool and wq attrs. Instead, + * get_unbound_pool() explicitly clears ->no_numa after copying. 
+ */ + to->no_numa = from->no_numa; } -static ssize_t max_active_store(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) +/* hash value of the content of @attr */ +static u32 wqattrs_hash(const struct workqueue_attrs *attrs) { - struct workqueue_struct *wq = dev_to_wq(dev); - int val; - - if (sscanf(buf, "%d", &val) != 1 || val <= 0) - return -EINVAL; + u32 hash = 0; - workqueue_set_max_active(wq, val); - return count; + hash = jhash_1word(attrs->nice, hash); + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + return hash; } -static DEVICE_ATTR_RW(max_active); - -static struct attribute *wq_sysfs_attrs[] = { - &dev_attr_per_cpu.attr, - &dev_attr_max_active.attr, - NULL, -}; -ATTRIBUTE_GROUPS(wq_sysfs); -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) +/* content equality test */ +static bool wqattrs_equal(const struct workqueue_attrs *a, + const struct workqueue_attrs *b) { - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - rcu_read_lock_sched(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock_sched(); - - return written; + if (a->nice != b->nice) + return false; + if (!cpumask_equal(a->cpumask, b->cpumask)) + return false; + return true; } -static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, - char *buf) +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * + * Return: 0 on success, -errno on failure. Even on failure, all fields + * inside @pool proper are initialized and put_unbound_pool() can be called + * on @pool safely to release it. 
+ */ +static int init_worker_pool(struct worker_pool *pool) { - struct workqueue_struct *wq = dev_to_wq(dev); - int written; + spin_lock_init(&pool->lock); + pool->id = -1; + pool->cpu = -1; + pool->node = NUMA_NO_NODE; + pool->flags |= POOL_DISASSOCIATED; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); - mutex_unlock(&wq->mutex); + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; - return written; -} + setup_timer(&pool->mayday_timer, pool_mayday_timeout, + (unsigned long)pool); -/* prepare workqueue_attrs for sysfs store operations */ -static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) -{ - struct workqueue_attrs *attrs; + mutex_init(&pool->manager_arb); + mutex_init(&pool->attach_mutex); + INIT_LIST_HEAD(&pool->workers); - attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!attrs) - return NULL; + ida_init(&pool->worker_ida); + INIT_HLIST_NODE(&pool->hash_node); + pool->refcnt = 1; - mutex_lock(&wq->mutex); - copy_workqueue_attrs(attrs, wq->unbound_attrs); - mutex_unlock(&wq->mutex); - return attrs; + /* shouldn't fail above this point */ + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; } -static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) +static void rcu_free_wq(struct rcu_head *rcu) { - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + struct workqueue_struct *wq = + container_of(rcu, struct workqueue_struct, rcu); - if (sscanf(buf, "%d", &attrs->nice) == 1 && - attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) - ret = apply_workqueue_attrs(wq, attrs); + if (!(wq->flags & WQ_UNBOUND)) + free_percpu(wq->cpu_pwqs); else - ret = -EINVAL; + free_workqueue_attrs(wq->unbound_attrs); - free_workqueue_attrs(attrs); - return ret ?: count; + kfree(wq->rescuer); + kfree(wq); } -static ssize_t wq_cpumask_show(struct device *dev, - struct device_attribute *attr, char *buf) +static void rcu_free_pool(struct rcu_head *rcu) { - struct workqueue_struct *wq = dev_to_wq(dev); - int written; + struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%*pb\n", - cpumask_pr_args(wq->unbound_attrs->cpumask)); - mutex_unlock(&wq->mutex); - return written; + ida_destroy(&pool->worker_ida); + free_workqueue_attrs(pool->attrs); + kfree(pool); } -static ssize_t wq_cpumask_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +/** + * put_unbound_pool - put a worker_pool + * @pool: worker_pool to put + * + * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * safe manner. get_unbound_pool() calls this function on its failure path + * and this function should be able to release pools which went through, + * successfully or not, init_worker_pool(). + * + * Should be called with wq_pool_mutex held. 
+ */ +static void put_unbound_pool(struct worker_pool *pool) { - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int ret; + DECLARE_COMPLETION_ONSTACK(detach_completion); + struct worker *worker; - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + lockdep_assert_held(&wq_pool_mutex); - ret = cpumask_parse(buf, attrs->cpumask); - if (!ret) - ret = apply_workqueue_attrs(wq, attrs); + if (--pool->refcnt) + return; - free_workqueue_attrs(attrs); - return ret ?: count; -} + /* sanity checks */ + if (WARN_ON(!(pool->cpu < 0)) || + WARN_ON(!list_empty(&pool->worklist))) + return; -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); - mutex_unlock(&wq->mutex); - - return written; -} - -static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int v, ret; - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - return -ENOMEM; + /* release id and unhash */ + if (pool->id >= 0) + idr_remove(&worker_pool_idr, pool->id); + hash_del(&pool->hash_node); - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; - ret = apply_workqueue_attrs(wq, attrs); - } + /* + * Become the manager and destroy all workers. Grabbing + * manager_arb prevents @pool's workers from blocking on + * attach_mutex. + */ + mutex_lock(&pool->manager_arb); - free_workqueue_attrs(attrs); - return ret ?: count; -} + spin_lock_irq(&pool->lock); + while ((worker = first_idle_worker(pool))) + destroy_worker(worker); + WARN_ON(pool->nr_workers || pool->nr_idle); + spin_unlock_irq(&pool->lock); -static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), - __ATTR(nice, 0644, wq_nice_show, wq_nice_store), - __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), - __ATTR_NULL, -}; + mutex_lock(&pool->attach_mutex); + if (!list_empty(&pool->workers)) + pool->detach_completion = &detach_completion; + mutex_unlock(&pool->attach_mutex); -static struct bus_type wq_subsys = { - .name = "workqueue", - .dev_groups = wq_sysfs_groups, -}; + if (pool->detach_completion) + wait_for_completion(pool->detach_completion); -static int __init wq_sysfs_init(void) -{ - return subsys_virtual_register(&wq_subsys, NULL); -} -core_initcall(wq_sysfs_init); + mutex_unlock(&pool->manager_arb); -static void wq_device_release(struct device *dev) -{ - struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + /* shut down the timers */ + del_timer_sync(&pool->idle_timer); + del_timer_sync(&pool->mayday_timer); - kfree(wq_dev); + /* sched-RCU protected to allow dereferences from get_work_pool() */ + call_rcu_sched(&pool->rcu, rcu_free_pool); } /** - * workqueue_sysfs_register - make a workqueue visible in sysfs - * @wq: the workqueue to register + * get_unbound_pool - get a worker_pool with the specified attributes + * @attrs: the attributes of the worker_pool to get * - * Expose @wq in sysfs under /sys/bus/workqueue/devices. - * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set - * which is the preferred method. + * Obtain a worker_pool which has the same attributes as @attrs, bump the + * reference count and return it. 
If there already is a matching + * worker_pool, it will be used; otherwise, this function attempts to + * create a new one. * - * Workqueue user should use this function directly iff it wants to apply - * workqueue_attrs before making the workqueue visible in sysfs; otherwise, - * apply_workqueue_attrs() may race against userland updating the - * attributes. + * Should be called with wq_pool_mutex held. * - * Return: 0 on success, -errno on failure. + * Return: On success, a worker_pool with the same attributes as @attrs. + * On failure, %NULL. */ -int workqueue_sysfs_register(struct workqueue_struct *wq) +static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) { - struct wq_device *wq_dev; - int ret; + u32 hash = wqattrs_hash(attrs); + struct worker_pool *pool; + int node; - /* - * Adjusting max_active or creating new pwqs by applyting - * attributes breaks ordering guarantee. Disallow exposing ordered - * workqueues. - */ - if (WARN_ON(wq->flags & __WQ_ORDERED)) - return -EINVAL; + lockdep_assert_held(&wq_pool_mutex); - wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); - if (!wq_dev) - return -ENOMEM; + /* do we already have a matching pool? */ + hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { + if (wqattrs_equal(pool->attrs, attrs)) { + pool->refcnt++; + return pool; + } + } - wq_dev->wq = wq; - wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; - wq_dev->dev.release = wq_device_release; + /* nope, create a new one */ + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool || init_worker_pool(pool) < 0) + goto fail; + + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ + copy_workqueue_attrs(pool->attrs, attrs); /* - * unbound_attrs are created separately. Suppress uevent until - * everything is ready. + * no_numa isn't a worker_pool attribute, always clear it. See + * 'struct workqueue_attrs' comments for detail. */ - dev_set_uevent_suppress(&wq_dev->dev, true); - - ret = device_register(&wq_dev->dev); - if (ret) { - kfree(wq_dev); - wq->wq_dev = NULL; - return ret; - } - - if (wq->flags & WQ_UNBOUND) { - struct device_attribute *attr; + pool->attrs->no_numa = false; - for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { - ret = device_create_file(&wq_dev->dev, attr); - if (ret) { - device_unregister(&wq_dev->dev); - wq->wq_dev = NULL; - return ret; + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(pool->attrs->cpumask, + wq_numa_possible_cpumask[node])) { + pool->node = node; + break; } } } - dev_set_uevent_suppress(&wq_dev->dev, false); - kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); - return 0; -} + if (worker_pool_assign_id(pool) < 0) + goto fail; -/** - * workqueue_sysfs_unregister - undo workqueue_sysfs_register() - * @wq: the workqueue to unregister - * - * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. 
- */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) -{ - struct wq_device *wq_dev = wq->wq_dev; + /* create and start the initial worker */ + if (!create_worker(pool)) + goto fail; - if (!wq->wq_dev) - return; + /* install */ + hash_add(unbound_pool_hash, &pool->hash_node, hash); - wq->wq_dev = NULL; - device_unregister(&wq_dev->dev); + return pool; +fail: + if (pool) + put_unbound_pool(pool); + return NULL; } -#else /* CONFIG_SYSFS */ -static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } -#endif /* CONFIG_SYSFS */ -/** - * free_workqueue_attrs - free a workqueue_attrs - * @attrs: workqueue_attrs to free - * - * Undo alloc_workqueue_attrs(). - */ -void free_workqueue_attrs(struct workqueue_attrs *attrs) +static void rcu_free_pwq(struct rcu_head *rcu) { - if (attrs) { - free_cpumask_var(attrs->cpumask); - kfree(attrs); - } + kmem_cache_free(pwq_cache, + container_of(rcu, struct pool_workqueue, rcu)); } -/** - * alloc_workqueue_attrs - allocate a workqueue_attrs - * @gfp_mask: allocation mask to use - * - * Allocate a new workqueue_attrs, initialize with default settings and - * return it. - * - * Return: The allocated new workqueue_attr on success. %NULL on failure. +/* + * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt + * and needs to be destroyed. */ -struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +static void pwq_unbound_release_workfn(struct work_struct *work) { - struct workqueue_attrs *attrs; + struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, + unbound_release_work); + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + bool is_last; - attrs = kzalloc(sizeof(*attrs), gfp_mask); - if (!attrs) - goto fail; - if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) - goto fail; + if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) + return; - cpumask_copy(attrs->cpumask, cpu_possible_mask); - return attrs; -fail: - free_workqueue_attrs(attrs); - return NULL; -} + mutex_lock(&wq->mutex); + list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); + mutex_unlock(&wq->mutex); + + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + + call_rcu_sched(&pwq->rcu, rcu_free_pwq); -static void copy_workqueue_attrs(struct workqueue_attrs *to, - const struct workqueue_attrs *from) -{ - to->nice = from->nice; - cpumask_copy(to->cpumask, from->cpumask); /* - * Unlike hash and equality test, this function doesn't ignore - * ->no_numa as it is used for both pool and wq attrs. Instead, - * get_unbound_pool() explicitly clears ->no_numa after copying. + * If we're the last pwq going away, @wq is already dead and no one + * is gonna access it anymore. Schedule RCU free. */ - to->no_numa = from->no_numa; -} - -/* hash value of the content of @attr */ -static u32 wqattrs_hash(const struct workqueue_attrs *attrs) -{ - u32 hash = 0; - - hash = jhash_1word(attrs->nice, hash); - hash = jhash(cpumask_bits(attrs->cpumask), - BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); - return hash; -} - -/* content equality test */ -static bool wqattrs_equal(const struct workqueue_attrs *a, - const struct workqueue_attrs *b) -{ - if (a->nice != b->nice) - return false; - if (!cpumask_equal(a->cpumask, b->cpumask)) - return false; - return true; + if (is_last) + call_rcu_sched(&wq->rcu, rcu_free_wq); } /** - * init_worker_pool - initialize a newly zalloc'd worker_pool - * @pool: worker_pool to initialize - * - * Initiailize a newly zalloc'd @pool. 
It also allocates @pool->attrs. + * pwq_adjust_max_active - update a pwq's max_active to the current setting + * @pwq: target pool_workqueue * - * Return: 0 on success, -errno on failure. Even on failure, all fields - * inside @pool proper are initialized and put_unbound_pool() can be called - * on @pool safely to release it. + * If @pwq isn't freezing, set @pwq->max_active to the associated + * workqueue's saved_max_active and activate delayed work items + * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. */ -static int init_worker_pool(struct worker_pool *pool) +static void pwq_adjust_max_active(struct pool_workqueue *pwq) { - spin_lock_init(&pool->lock); - pool->id = -1; - pool->cpu = -1; - pool->node = NUMA_NO_NODE; - pool->flags |= POOL_DISASSOCIATED; - INIT_LIST_HEAD(&pool->worklist); - INIT_LIST_HEAD(&pool->idle_list); - hash_init(pool->busy_hash); - - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; + struct workqueue_struct *wq = pwq->wq; + bool freezable = wq->flags & WQ_FREEZABLE; - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); + /* for @wq->saved_max_active */ + lockdep_assert_held(&wq->mutex); - mutex_init(&pool->manager_arb); - mutex_init(&pool->attach_mutex); - INIT_LIST_HEAD(&pool->workers); + /* fast exit for non-freezable wqs */ + if (!freezable && pwq->max_active == wq->saved_max_active) + return; - ida_init(&pool->worker_ida); - INIT_HLIST_NODE(&pool->hash_node); - pool->refcnt = 1; + spin_lock_irq(&pwq->pool->lock); - /* shouldn't fail above this point */ - pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); - if (!pool->attrs) - return -ENOMEM; - return 0; -} + /* + * During [un]freezing, the caller is responsible for ensuring that + * this function is called at least once after @workqueue_freezing + * is updated and visible. + */ + if (!freezable || !workqueue_freezing) { + pwq->max_active = wq->saved_max_active; -static void rcu_free_wq(struct rcu_head *rcu) -{ - struct workqueue_struct *wq = - container_of(rcu, struct workqueue_struct, rcu); + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); - else - free_workqueue_attrs(wq->unbound_attrs); + /* + * Need to kick a worker after thawed or an unbound wq's + * max_active is bumped. It's a slow path. Do it always. + */ + wake_up_worker(pwq->pool); + } else { + pwq->max_active = 0; + } - kfree(wq->rescuer); - kfree(wq); + spin_unlock_irq(&pwq->pool->lock); } -static void rcu_free_pool(struct rcu_head *rcu) +/* initialize newly alloced @pwq which is associated with @wq and @pool */ +static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, + struct worker_pool *pool) { - struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - ida_destroy(&pool->worker_ida); - free_workqueue_attrs(pool->attrs); - kfree(pool); + memset(pwq, 0, sizeof(*pwq)); + + pwq->pool = pool; + pwq->wq = wq; + pwq->flush_color = -1; + pwq->refcnt = 1; + INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->pwqs_node); + INIT_LIST_HEAD(&pwq->mayday_node); + INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); } -/** - * put_unbound_pool - put a worker_pool - * @pool: worker_pool to put - * - * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU - * safe manner. 
get_unbound_pool() calls this function on its failure path - * and this function should be able to release pools which went through, - * successfully or not, init_worker_pool(). - * - * Should be called with wq_pool_mutex held. - */ -static void put_unbound_pool(struct worker_pool *pool) +/* sync @pwq with the current state of its associated wq and link it */ +static void link_pwq(struct pool_workqueue *pwq) { - DECLARE_COMPLETION_ONSTACK(detach_completion); - struct worker *worker; + struct workqueue_struct *wq = pwq->wq; - lockdep_assert_held(&wq_pool_mutex); + lockdep_assert_held(&wq->mutex); - if (--pool->refcnt) + /* may be called multiple times, ignore if already linked */ + if (!list_empty(&pwq->pwqs_node)) return; - /* sanity checks */ - if (WARN_ON(!(pool->cpu < 0)) || - WARN_ON(!list_empty(&pool->worklist))) - return; + /* set the matching work_color */ + pwq->work_color = wq->work_color; - /* release id and unhash */ - if (pool->id >= 0) - idr_remove(&worker_pool_idr, pool->id); - hash_del(&pool->hash_node); + /* sync max_active to the current setting */ + pwq_adjust_max_active(pwq); - /* - * Become the manager and destroy all workers. Grabbing - * manager_arb prevents @pool's workers from blocking on - * attach_mutex. - */ - mutex_lock(&pool->manager_arb); + /* link in @pwq */ + list_add_rcu(&pwq->pwqs_node, &wq->pwqs); +} - spin_lock_irq(&pool->lock); - while ((worker = first_idle_worker(pool))) - destroy_worker(worker); - WARN_ON(pool->nr_workers || pool->nr_idle); - spin_unlock_irq(&pool->lock); +/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ +static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct worker_pool *pool; + struct pool_workqueue *pwq; - mutex_lock(&pool->attach_mutex); - if (!list_empty(&pool->workers)) - pool->detach_completion = &detach_completion; - mutex_unlock(&pool->attach_mutex); + lockdep_assert_held(&wq_pool_mutex); - if (pool->detach_completion) - wait_for_completion(pool->detach_completion); + pool = get_unbound_pool(attrs); + if (!pool) + return NULL; - mutex_unlock(&pool->manager_arb); + pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); + if (!pwq) { + put_unbound_pool(pool); + return NULL; + } - /* shut down the timers */ - del_timer_sync(&pool->idle_timer); - del_timer_sync(&pool->mayday_timer); + init_pwq(pwq, wq, pool); + return pwq; +} - /* sched-RCU protected to allow dereferences from get_work_pool() */ - call_rcu_sched(&pool->rcu, rcu_free_pool); +/* undo alloc_unbound_pwq(), used only in the error path */ +static void free_unbound_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&wq_pool_mutex); + + if (pwq) { + put_unbound_pool(pwq->pool); + kmem_cache_free(pwq_cache, pwq); + } } /** - * get_unbound_pool - get a worker_pool with the specified attributes - * @attrs: the attributes of the worker_pool to get + * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of interest + * @node: the target NUMA node + * @cpu_going_down: if >= 0, the CPU to consider as offline + * @cpumask: outarg, the resulting cpumask * - * Obtain a worker_pool which has the same attributes as @attrs, bump the - * reference count and return it. If there already is a matching - * worker_pool, it will be used; otherwise, this function attempts to - * create a new one. + * Calculate the cpumask a workqueue with @attrs should use on @node. 
If + * @cpu_going_down is >= 0, that cpu is considered offline during + * calculation. The result is stored in @cpumask. * - * Should be called with wq_pool_mutex held. + * If NUMA affinity is not enabled, @attrs->cpumask is always used. If + * enabled and @node has online CPUs requested by @attrs, the returned + * cpumask is the intersection of the possible CPUs of @node and + * @attrs->cpumask. * - * Return: On success, a worker_pool with the same attributes as @attrs. - * On failure, %NULL. + * The caller is responsible for ensuring that the cpumask of @node stays + * stable. + * + * Return: %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. */ -static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) +static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, + int cpu_going_down, cpumask_t *cpumask) { - u32 hash = wqattrs_hash(attrs); - struct worker_pool *pool; - int node; + if (!wq_numa_enabled || attrs->no_numa) + goto use_dfl; - lockdep_assert_held(&wq_pool_mutex); + /* does @node have any online CPUs @attrs wants? */ + cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + if (cpu_going_down >= 0) + cpumask_clear_cpu(cpu_going_down, cpumask); - /* do we already have a matching pool? */ - hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { - if (wqattrs_equal(pool->attrs, attrs)) { - pool->refcnt++; - return pool; - } - } - - /* nope, create a new one */ - pool = kzalloc(sizeof(*pool), GFP_KERNEL); - if (!pool || init_worker_pool(pool) < 0) - goto fail; - - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ - copy_workqueue_attrs(pool->attrs, attrs); - - /* - * no_numa isn't a worker_pool attribute, always clear it. See - * 'struct workqueue_attrs' comments for detail. - */ - pool->attrs->no_numa = false; - - /* if cpumask is contained inside a NUMA node, we belong to that node */ - if (wq_numa_enabled) { - for_each_node(node) { - if (cpumask_subset(pool->attrs->cpumask, - wq_numa_possible_cpumask[node])) { - pool->node = node; - break; - } - } - } - - if (worker_pool_assign_id(pool) < 0) - goto fail; - - /* create and start the initial worker */ - if (!create_worker(pool)) - goto fail; - - /* install */ - hash_add(unbound_pool_hash, &pool->hash_node, hash); - - return pool; -fail: - if (pool) - put_unbound_pool(pool); - return NULL; -} - -static void rcu_free_pwq(struct rcu_head *rcu) -{ - kmem_cache_free(pwq_cache, - container_of(rcu, struct pool_workqueue, rcu)); -} - -/* - * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt - * and needs to be destroyed. - */ -static void pwq_unbound_release_workfn(struct work_struct *work) -{ - struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, - unbound_release_work); - struct workqueue_struct *wq = pwq->wq; - struct worker_pool *pool = pwq->pool; - bool is_last; - - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; - - mutex_lock(&wq->mutex); - list_del_rcu(&pwq->pwqs_node); - is_last = list_empty(&wq->pwqs); - mutex_unlock(&wq->mutex); - - mutex_lock(&wq_pool_mutex); - put_unbound_pool(pool); - mutex_unlock(&wq_pool_mutex); - - call_rcu_sched(&pwq->rcu, rcu_free_pwq); - - /* - * If we're the last pwq going away, @wq is already dead and no one - * is gonna access it anymore. Schedule RCU free. 
- */ - if (is_last) - call_rcu_sched(&wq->rcu, rcu_free_wq); -} - -/** - * pwq_adjust_max_active - update a pwq's max_active to the current setting - * @pwq: target pool_workqueue - * - * If @pwq isn't freezing, set @pwq->max_active to the associated - * workqueue's saved_max_active and activate delayed work items - * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. - */ -static void pwq_adjust_max_active(struct pool_workqueue *pwq) -{ - struct workqueue_struct *wq = pwq->wq; - bool freezable = wq->flags & WQ_FREEZABLE; - - /* for @wq->saved_max_active */ - lockdep_assert_held(&wq->mutex); - - /* fast exit for non-freezable wqs */ - if (!freezable && pwq->max_active == wq->saved_max_active) - return; - - spin_lock_irq(&pwq->pool->lock); - - /* - * During [un]freezing, the caller is responsible for ensuring that - * this function is called at least once after @workqueue_freezing - * is updated and visible. - */ - if (!freezable || !workqueue_freezing) { - pwq->max_active = wq->saved_max_active; - - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); - - /* - * Need to kick a worker after thawed or an unbound wq's - * max_active is bumped. It's a slow path. Do it always. - */ - wake_up_worker(pwq->pool); - } else { - pwq->max_active = 0; - } - - spin_unlock_irq(&pwq->pool->lock); -} - -/* initialize newly alloced @pwq which is associated with @wq and @pool */ -static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, - struct worker_pool *pool) -{ - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - - memset(pwq, 0, sizeof(*pwq)); - - pwq->pool = pool; - pwq->wq = wq; - pwq->flush_color = -1; - pwq->refcnt = 1; - INIT_LIST_HEAD(&pwq->delayed_works); - INIT_LIST_HEAD(&pwq->pwqs_node); - INIT_LIST_HEAD(&pwq->mayday_node); - INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); -} - -/* sync @pwq with the current state of its associated wq and link it */ -static void link_pwq(struct pool_workqueue *pwq) -{ - struct workqueue_struct *wq = pwq->wq; - - lockdep_assert_held(&wq->mutex); - - /* may be called multiple times, ignore if already linked */ - if (!list_empty(&pwq->pwqs_node)) - return; - - /* set the matching work_color */ - pwq->work_color = wq->work_color; - - /* sync max_active to the current setting */ - pwq_adjust_max_active(pwq); - - /* link in @pwq */ - list_add_rcu(&pwq->pwqs_node, &wq->pwqs); -} - -/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ -static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, - const struct workqueue_attrs *attrs) -{ - struct worker_pool *pool; - struct pool_workqueue *pwq; - - lockdep_assert_held(&wq_pool_mutex); - - pool = get_unbound_pool(attrs); - if (!pool) - return NULL; - - pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); - if (!pwq) { - put_unbound_pool(pool); - return NULL; - } - - init_pwq(pwq, wq, pool); - return pwq; -} - -/* undo alloc_unbound_pwq(), used only in the error path */ -static void free_unbound_pwq(struct pool_workqueue *pwq) -{ - lockdep_assert_held(&wq_pool_mutex); - - if (pwq) { - put_unbound_pool(pwq->pool); - kmem_cache_free(pwq_cache, pwq); - } -} - -/** - * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node - * @attrs: the wq_attrs of interest - * @node: the target NUMA node - * @cpu_going_down: if >= 0, the CPU to consider as offline - * @cpumask: outarg, the resulting cpumask - * - * Calculate the cpumask a workqueue 
with @attrs should use on @node. If - * @cpu_going_down is >= 0, that cpu is considered offline during - * calculation. The result is stored in @cpumask. - * - * If NUMA affinity is not enabled, @attrs->cpumask is always used. If - * enabled and @node has online CPUs requested by @attrs, the returned - * cpumask is the intersection of the possible CPUs of @node and - * @attrs->cpumask. - * - * The caller is responsible for ensuring that the cpumask of @node stays - * stable. - * - * Return: %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. - */ -static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, - int cpu_going_down, cpumask_t *cpumask) -{ - if (!wq_numa_enabled || attrs->no_numa) - goto use_dfl; - - /* does @node have any online CPUs @attrs wants? */ - cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); - if (cpu_going_down >= 0) - cpumask_clear_cpu(cpu_going_down, cpumask); - - if (cpumask_empty(cpumask)) - goto use_dfl; + if (cpumask_empty(cpumask)) + goto use_dfl; /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); @@ -4817,202 +4501,519 @@ static int workqueue_cpu_up_callback(struct notifier_block *nfb, else if (pool->cpu < 0) restore_unbound_workers_cpumask(pool, cpu); - mutex_unlock(&pool->attach_mutex); - } + mutex_unlock(&pool->attach_mutex); + } + + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); + + mutex_unlock(&wq_pool_mutex); + break; + } + return NOTIFY_OK; +} + +/* + * Workqueues should be brought down after normal priority CPU notifiers. + * This will be registered as low priority CPU notifier. + */ +static int workqueue_cpu_down_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + struct work_struct unbind_work; + struct workqueue_struct *wq; + + switch (action & ~CPU_TASKS_FROZEN) { + case CPU_DOWN_PREPARE: + /* unbinding per-cpu workers should happen on the local CPU */ + INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); + queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ + flush_work(&unbind_work); + destroy_work_on_stack(&unbind_work); + break; + } + return NOTIFY_OK; +} + +#ifdef CONFIG_SMP + +struct work_for_cpu { + struct work_struct work; + long (*fn)(void *); + void *arg; + long ret; +}; + +static void work_for_cpu_fn(struct work_struct *work) +{ + struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); + + wfc->ret = wfc->fn(wfc->arg); +} + +/** + * work_on_cpu - run a function in user context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function arg + * + * It is up to the caller to ensure that the cpu doesn't go offline. + * The caller must not hold any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. 
+ */ +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +{ + struct work_for_cpu wfc = { .fn = fn, .arg = arg }; + + INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); + schedule_work_on(cpu, &wfc.work); + flush_work(&wfc.work); + destroy_work_on_stack(&wfc.work); + return wfc.ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu); +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_FREEZER + +/** + * freeze_workqueues_begin - begin freezing workqueues + * + * Start freezing workqueues. After this function returns, all freezable + * workqueues will queue new works to their delayed_works list instead of + * pool->worklist. + * + * CONTEXT: + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + */ +void freeze_workqueues_begin(void) +{ + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + WARN_ON_ONCE(workqueue_freezing); + workqueue_freezing = true; + + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); + } + + mutex_unlock(&wq_pool_mutex); +} + +/** + * freeze_workqueues_busy - are freezable workqueues still busy? + * + * Check whether freezing is complete. This function must be called + * between freeze_workqueues_begin() and thaw_workqueues(). + * + * CONTEXT: + * Grabs and releases wq_pool_mutex. + * + * Return: + * %true if some freezable workqueues are still busy. %false if freezing + * is complete. + */ +bool freeze_workqueues_busy(void) +{ + bool busy = false; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + WARN_ON_ONCE(!workqueue_freezing); + + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; + /* + * nr_active is monotonically decreasing. It's safe + * to peek without lock. + */ + rcu_read_lock_sched(); + for_each_pwq(pwq, wq) { + WARN_ON_ONCE(pwq->nr_active < 0); + if (pwq->nr_active) { + busy = true; + rcu_read_unlock_sched(); + goto out_unlock; + } + } + rcu_read_unlock_sched(); + } +out_unlock: + mutex_unlock(&wq_pool_mutex); + return busy; +} + +/** + * thaw_workqueues - thaw workqueues + * + * Thaw workqueues. Normal queueing is restored and all collected + * frozen works are transferred to their respective pool worklists. + * + * CONTEXT: + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + */ +void thaw_workqueues(void) +{ + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + + mutex_lock(&wq_pool_mutex); + + if (!workqueue_freezing) + goto out_unlock; + + workqueue_freezing = false; + + /* restore max_active and repopulate worklist */ + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); + } + +out_unlock: + mutex_unlock(&wq_pool_mutex); +} +#endif /* CONFIG_FREEZER */ + +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. 
+ * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t per_cpu_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} +static DEVICE_ATTR_RO(per_cpu); + +static ssize_t max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t max_active_store(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} +static DEVICE_ATTR_RW(max_active); + +static struct attribute *wq_sysfs_attrs[] = { + &dev_attr_per_cpu.attr, + &dev_attr_max_active.attr, + NULL, +}; +ATTRIBUTE_GROUPS(wq_sysfs); + +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + const char *delim = ""; + int node, written = 0; + + rcu_read_lock_sched(); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= MIN_NICE && attrs->nice <= MAX_NICE) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%*pb\n", + cpumask_pr_args(wq->unbound_attrs->cpumask)); + mutex_unlock(&wq->mutex); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct 
workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; - /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); - mutex_unlock(&wq_pool_mutex); - break; - } - return NOTIFY_OK; + return written; } -/* - * Workqueues should be brought down after normal priority CPU notifiers. - * This will be registered as low priority CPU notifier. - */ -static int workqueue_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) { - int cpu = (unsigned long)hcpu; - struct work_struct unbind_work; - struct workqueue_struct *wq; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_PREPARE: - /* unbinding per-cpu workers should happen on the local CPU */ - INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); - queue_work_on(cpu, system_highpri_wq, &unbind_work); + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; - /* update NUMA affinity of unbound workqueues */ - mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); - mutex_unlock(&wq_pool_mutex); + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; - /* wait for per-cpu unbinding to finish */ - flush_work(&unbind_work); - destroy_work_on_stack(&unbind_work); - break; + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); } - return NOTIFY_OK; + + free_workqueue_attrs(attrs); + return ret ?: count; } -#ifdef CONFIG_SMP +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR_NULL, +}; -struct work_for_cpu { - struct work_struct work; - long (*fn)(void *); - void *arg; - long ret; +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_groups = wq_sysfs_groups, }; -static void work_for_cpu_fn(struct work_struct *work) +static int __init wq_sysfs_init(void) { - struct work_for_cpu *wfc = container_of(work, struct work_for_cpu, work); - - wfc->ret = wfc->fn(wfc->arg); + return subsys_virtual_register(&wq_subsys, NULL); } +core_initcall(wq_sysfs_init); -/** - * work_on_cpu - run a function in user context on a particular cpu - * @cpu: the cpu to run on - * @fn: the function to run - * @arg: the function arg - * - * It is up to the caller to ensure that the cpu doesn't go offline. - * The caller must not hold any locks which would prevent @fn from completing. - * - * Return: The value @fn returns. 
- */ -long work_on_cpu(int cpu, long (*fn)(void *), void *arg) +static void wq_device_release(struct device *dev) { - struct work_for_cpu wfc = { .fn = fn, .arg = arg }; + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); - INIT_WORK_ONSTACK(&wfc.work, work_for_cpu_fn); - schedule_work_on(cpu, &wfc.work); - flush_work(&wfc.work); - destroy_work_on_stack(&wfc.work); - return wfc.ret; + kfree(wq_dev); } -EXPORT_SYMBOL_GPL(work_on_cpu); -#endif /* CONFIG_SMP */ - -#ifdef CONFIG_FREEZER /** - * freeze_workqueues_begin - begin freezing workqueues + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register * - * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their delayed_works list instead of - * pool->worklist. + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. * - * CONTEXT: - * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Return: 0 on success, -errno on failure. */ -void freeze_workqueues_begin(void) +int workqueue_sysfs_register(struct workqueue_struct *wq) { - struct workqueue_struct *wq; - struct pool_workqueue *pwq; - - mutex_lock(&wq_pool_mutex); + struct wq_device *wq_dev; + int ret; - WARN_ON_ONCE(workqueue_freezing); - workqueue_freezing = true; + /* + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. + */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; - list_for_each_entry(wq, &workqueues, list) { - mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); - mutex_unlock(&wq->mutex); - } + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; - mutex_unlock(&wq_pool_mutex); -} + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; -/** - * freeze_workqueues_busy - are freezable workqueues still busy? - * - * Check whether freezing is complete. This function must be called - * between freeze_workqueues_begin() and thaw_workqueues(). - * - * CONTEXT: - * Grabs and releases wq_pool_mutex. - * - * Return: - * %true if some freezable workqueues are still busy. %false if freezing - * is complete. - */ -bool freeze_workqueues_busy(void) -{ - bool busy = false; - struct workqueue_struct *wq; - struct pool_workqueue *pwq; + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); - mutex_lock(&wq_pool_mutex); + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } - WARN_ON_ONCE(!workqueue_freezing); + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; - list_for_each_entry(wq, &workqueues, list) { - if (!(wq->flags & WQ_FREEZABLE)) - continue; - /* - * nr_active is monotonically decreasing. It's safe - * to peek without lock. 
- */ - rcu_read_lock_sched(); - for_each_pwq(pwq, wq) { - WARN_ON_ONCE(pwq->nr_active < 0); - if (pwq->nr_active) { - busy = true; - rcu_read_unlock_sched(); - goto out_unlock; + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; } } - rcu_read_unlock_sched(); } -out_unlock: - mutex_unlock(&wq_pool_mutex); - return busy; + + dev_set_uevent_suppress(&wq_dev->dev, false); + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; } /** - * thaw_workqueues - thaw workqueues - * - * Thaw workqueues. Normal queueing is restored and all collected - * frozen works are transferred to their respective pool worklists. + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister * - * CONTEXT: - * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. */ -void thaw_workqueues(void) +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { - struct workqueue_struct *wq; - struct pool_workqueue *pwq; - - mutex_lock(&wq_pool_mutex); - - if (!workqueue_freezing) - goto out_unlock; - - workqueue_freezing = false; + struct wq_device *wq_dev = wq->wq_dev; - /* restore max_active and repopulate worklist */ - list_for_each_entry(wq, &workqueues, list) { - mutex_lock(&wq->mutex); - for_each_pwq(pwq, wq) - pwq_adjust_max_active(pwq); - mutex_unlock(&wq->mutex); - } + if (!wq->wq_dev) + return; -out_unlock: - mutex_unlock(&wq_pool_mutex); + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); } -#endif /* CONFIG_FREEZER */ +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ static void __init wq_numa_init(void) { -- cgit v1.3-8-gc7d7 From f82daee49c09cf6a99c28303d93438a2566e5552 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 7 Apr 2015 01:07:39 +0200 Subject: Revert "PM / hibernate: avoid unsafe pages in e820 reserved regions" Commit 84c91b7ae07c (PM / hibernate: avoid unsafe pages in e820 reserved regions) is reported to make resume from hibernation on Lenovo x230 unreliable, so revert it. We will revisit the issue the commit in question was supposed to fix in the future. Link: https://bugzilla.kernel.org/show_bug.cgi?id=96111 Reported-by: rhn Cc: 3.17+ # 3.17+ Signed-off-by: Rafael J. Wysocki --- kernel/power/snapshot.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c24d5a23bf93..5235dd4e1e2f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -955,25 +955,6 @@ static void mark_nosave_pages(struct memory_bitmap *bm) } } -static bool is_nosave_page(unsigned long pfn) -{ - struct nosave_region *region; - - list_for_each_entry(region, &nosave_regions, list) { - if (pfn >= region->start_pfn && pfn < region->end_pfn) { - pr_err("PM: %#010llx in e820 nosave region: " - "[mem %#010llx-%#010llx]\n", - (unsigned long long) pfn << PAGE_SHIFT, - (unsigned long long) region->start_pfn << PAGE_SHIFT, - ((unsigned long long) region->end_pfn << PAGE_SHIFT) - - 1); - return true; - } - } - - return false; -} - /** * create_basic_memory_bitmaps - create bitmaps needed for marking page * frames that should not be saved and free page frames. 
The pointers @@ -2042,7 +2023,7 @@ static int mark_unsafe_pages(struct memory_bitmap *bm) do { pfn = memory_bm_next_pfn(bm); if (likely(pfn != BM_END_OF_MAP)) { - if (likely(pfn_valid(pfn)) && !is_nosave_page(pfn)) + if (likely(pfn_valid(pfn))) swsusp_set_page_free(pfn_to_page(pfn)); else return -EFAULT; -- cgit v1.3-8-gc7d7 From 6b79c57b92cdd90853002980609af516d14c4f9c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 7 Apr 2015 14:26:47 -0700 Subject: mm: numa: disable change protection for vma(VM_HUGETLB) Currently when a process accesses a hugetlb range protected with PROTNONE, unexpected COWs are triggered, which finally puts the hugetlb subsystem into a broken/uncontrollable state, where for example h->resv_huge_pages is subtracted too much and wraps around to a very large number, and the free hugepage pool is no longer maintainable. This patch simply stops changing protection for vma(VM_HUGETLB) to fix the problem. And this also allows us to avoid useless overhead of minor faults. Signed-off-by: Naoya Horiguchi Suggested-by: Mel Gorman Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: David Rientjes Cc: Rik van Riel Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched/fair.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bcfe32088b37..241213be507c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2165,8 +2165,10 @@ void task_numa_work(struct callback_head *work) vma = mm->mmap; } for (; vma; vma = vma->vm_next) { - if (!vma_migratable(vma) || !vma_policy_mof(vma)) + if (!vma_migratable(vma) || !vma_policy_mof(vma) || + is_vm_hugetlb_page(vma)) { continue; + } /* * Shared library pages mapped by multiple processes are not -- cgit v1.3-8-gc7d7 From 0c564a538aa934ad15b2145aaf8b64f3feb0be63 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 24 Mar 2015 17:58:09 -0400 Subject: tracing: Add TRACE_DEFINE_ENUM() macro to map enums to their values Several tracepoints use the helper functions __print_symbolic() or __print_flags() and pass in enums that do the mapping between the binary data stored and the value to print. This works well for reading the ASCII trace files, but when the data is read via userspace tools such as perf and trace-cmd, the conversion of the binary value to a human string format is lost if an enum is used, as userspace does not have access to what the ENUM is. For example, the tracepoint trace_tlb_flush() has: __print_symbolic(REC->reason, { TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" }, { TLB_REMOTE_SHOOTDOWN, "remote shootdown" }, { TLB_LOCAL_SHOOTDOWN, "local shootdown" }, { TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" }) Which maps the enum values to the strings they represent. But perf and trace-cmd do not know what value TLB_LOCAL_MM_SHOOTDOWN is, and would not be able to map it. With TRACE_DEFINE_ENUM(), developers can place these in the event header files and ftrace will convert the enums to their values: By adding: TRACE_DEFINE_ENUM(TLB_FLUSH_ON_TASK_SWITCH); TRACE_DEFINE_ENUM(TLB_REMOTE_SHOOTDOWN); TRACE_DEFINE_ENUM(TLB_LOCAL_SHOOTDOWN); TRACE_DEFINE_ENUM(TLB_LOCAL_MM_SHOOTDOWN); $ cat /sys/kernel/debug/tracing/events/tlb/tlb_flush/format [...] 
__print_symbolic(REC->reason, { 0, "flush on task switch" }, { 1, "remote shootdown" }, { 2, "local shootdown" }, { 3, "local mm shootdown" }) The above is what userspace expects to see, and tools do not need to be modified to parse them. Link: http://lkml.kernel.org/r/20150403013802.220157513@goodmis.org Cc: Guilherme Cox Cc: Tony Luck Cc: Xie XiuQi Acked-by: Namhyung Kim Reviewed-by: Masami Hiramatsu Tested-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- include/asm-generic/vmlinux.lds.h | 5 +- include/linux/ftrace_event.h | 2 +- include/linux/tracepoint.h | 8 +++ include/trace/ftrace.h | 22 ++++++- kernel/trace/trace.c | 26 ++++++++- kernel/trace/trace.h | 2 + kernel/trace/trace_events.c | 119 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 178 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index ac78910d7416..f8e8b34dc427 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -124,7 +124,10 @@ #define FTRACE_EVENTS() . = ALIGN(8); \ VMLINUX_SYMBOL(__start_ftrace_events) = .; \ *(_ftrace_events) \ - VMLINUX_SYMBOL(__stop_ftrace_events) = .; + VMLINUX_SYMBOL(__stop_ftrace_events) = .; \ + VMLINUX_SYMBOL(__start_ftrace_enum_maps) = .; \ + *(_ftrace_enum_map) \ + VMLINUX_SYMBOL(__stop_ftrace_enum_maps) = .; #else #define FTRACE_EVENTS() #endif diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 62b8fac7ded5..112cf49d9576 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -285,7 +285,7 @@ struct ftrace_event_call { struct tracepoint *tp; }; struct trace_event event; - const char *print_fmt; + char *print_fmt; struct event_filter *filter; void *mod; void *data; diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index c72851328ca9..a5f7f3ecafa3 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -36,6 +36,12 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; +struct trace_enum_map { + const char *system; + const char *enum_string; + unsigned long enum_value; +}; + extern int tracepoint_probe_register(struct tracepoint *tp, void *probe, void *data); extern int @@ -87,6 +93,8 @@ extern void syscall_unregfunc(void); #define PARAMS(args...) args +#define TRACE_DEFINE_ENUM(x) + #endif /* _LINUX_TRACEPOINT_H */ /* diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 2f9b95b6d3fb..37d4b10b111d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -33,6 +33,19 @@ TRACE_MAKE_SYSTEM_STR(); +#undef TRACE_DEFINE_ENUM +#define TRACE_DEFINE_ENUM(a) \ + static struct trace_enum_map __used __initdata \ + __##TRACE_SYSTEM##_##a = \ + { \ + .system = TRACE_SYSTEM_STRING, \ + .enum_string = #a, \ + .enum_value = a \ + }; \ + static struct trace_enum_map __used \ + __attribute__((section("_ftrace_enum_map"))) \ + *TRACE_SYSTEM##_##a = &__##TRACE_SYSTEM##_##a + /* * DECLARE_EVENT_CLASS can be used to add a generic function * handlers for events. That is, if all events have the same @@ -136,6 +149,9 @@ TRACE_MAKE_SYSTEM_STR(); * The size of an array is also encoded, in the higher 16 bits of . */ +#undef TRACE_DEFINE_ENUM +#define TRACE_DEFINE_ENUM(a) + #undef __field #define __field(type, item) @@ -553,7 +569,7 @@ static inline notrace int ftrace_get_offsets_##call( \ * .trace = ftrace_raw_output_, <-- stage 2 * }; * - * static const char print_fmt_[] = ; + * static char print_fmt_[] = ; * * static struct ftrace_event_class __used event_class_